{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 17469, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.9193334579467773, "learning_rate": 1e-05, "loss": 1.5817, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.434458017349243, "learning_rate": 1e-05, "loss": 1.3487, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.4167187213897705, "learning_rate": 1e-05, "loss": 1.3269, "step": 30 }, { "epoch": 0.01, "grad_norm": 3.078380823135376, "learning_rate": 1e-05, "loss": 1.3464, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.6377758979797363, "learning_rate": 1e-05, "loss": 1.2832, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.3049213886260986, "learning_rate": 1e-05, "loss": 1.3479, "step": 60 }, { "epoch": 0.01, "grad_norm": 2.2546005249023438, "learning_rate": 1e-05, "loss": 1.35, "step": 70 }, { "epoch": 0.01, "grad_norm": 2.328960418701172, "learning_rate": 1e-05, "loss": 1.4031, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.6193785667419434, "learning_rate": 1e-05, "loss": 1.2959, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.4487783908843994, "learning_rate": 1e-05, "loss": 1.292, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.7081260681152344, "learning_rate": 1e-05, "loss": 1.3578, "step": 110 }, { "epoch": 0.02, "grad_norm": 2.4693126678466797, "learning_rate": 1e-05, "loss": 1.346, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.3680224418640137, "learning_rate": 1e-05, "loss": 1.2894, "step": 130 }, { "epoch": 0.02, "grad_norm": 6.633376598358154, "learning_rate": 1e-05, "loss": 1.336, "step": 140 }, { "epoch": 0.03, "grad_norm": 3.109175682067871, "learning_rate": 1e-05, "loss": 1.3236, "step": 150 }, { "epoch": 0.03, "grad_norm": 2.653766393661499, "learning_rate": 1e-05, "loss": 1.301, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.339801788330078, "learning_rate": 1e-05, "loss": 1.35, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.5112967491149902, "learning_rate": 1e-05, "loss": 1.4245, "step": 180 }, { "epoch": 0.03, "grad_norm": 2.812804937362671, "learning_rate": 1e-05, "loss": 1.4252, "step": 190 }, { "epoch": 0.03, "grad_norm": 2.4148075580596924, "learning_rate": 1e-05, "loss": 1.2881, "step": 200 }, { "epoch": 0.04, "grad_norm": 2.221367120742798, "learning_rate": 1e-05, "loss": 1.2941, "step": 210 }, { "epoch": 0.04, "grad_norm": 1.8252381086349487, "learning_rate": 1e-05, "loss": 1.3305, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.6195831298828125, "learning_rate": 1e-05, "loss": 1.2651, "step": 230 }, { "epoch": 0.04, "grad_norm": 3.789186716079712, "learning_rate": 1e-05, "loss": 1.3352, "step": 240 }, { "epoch": 0.04, "grad_norm": 2.3786251544952393, "learning_rate": 1e-05, "loss": 1.3115, "step": 250 }, { "epoch": 0.04, "grad_norm": 2.0984363555908203, "learning_rate": 1e-05, "loss": 1.4102, "step": 260 }, { "epoch": 0.05, "grad_norm": 3.215639591217041, "learning_rate": 1e-05, "loss": 1.3049, "step": 270 }, { "epoch": 0.05, "grad_norm": 2.619779109954834, "learning_rate": 1e-05, "loss": 1.3686, "step": 280 }, { "epoch": 0.05, "grad_norm": 2.424197196960449, "learning_rate": 1e-05, "loss": 1.3401, "step": 290 }, { "epoch": 0.05, "grad_norm": 1.6028305292129517, "learning_rate": 1e-05, "loss": 1.3297, "step": 300 }, { "epoch": 0.05, "grad_norm": 3.2444934844970703, "learning_rate": 1e-05, "loss": 1.369, "step": 310 }, { "epoch": 0.05, "grad_norm": 3.5839474201202393, "learning_rate": 1e-05, "loss": 1.3516, "step": 320 }, { "epoch": 0.06, "grad_norm": 3.0733659267425537, "learning_rate": 1e-05, "loss": 1.3525, "step": 330 }, { "epoch": 0.06, "grad_norm": 2.1361422538757324, "learning_rate": 1e-05, "loss": 1.3495, "step": 340 }, { "epoch": 0.06, "grad_norm": 2.298948287963867, "learning_rate": 1e-05, "loss": 1.3479, "step": 350 }, { "epoch": 0.06, "grad_norm": 2.780656337738037, "learning_rate": 1e-05, "loss": 1.3573, "step": 360 }, { "epoch": 0.06, "grad_norm": 4.663308143615723, "learning_rate": 1e-05, "loss": 1.3882, "step": 370 }, { "epoch": 0.07, "grad_norm": 2.2056221961975098, "learning_rate": 1e-05, "loss": 1.3765, "step": 380 }, { "epoch": 0.07, "grad_norm": 2.040045976638794, "learning_rate": 1e-05, "loss": 1.3591, "step": 390 }, { "epoch": 0.07, "grad_norm": 2.020139694213867, "learning_rate": 1e-05, "loss": 1.4021, "step": 400 }, { "epoch": 0.07, "grad_norm": 2.6178290843963623, "learning_rate": 1e-05, "loss": 1.3574, "step": 410 }, { "epoch": 0.07, "grad_norm": 2.0279769897460938, "learning_rate": 1e-05, "loss": 1.2923, "step": 420 }, { "epoch": 0.07, "grad_norm": 1.8427987098693848, "learning_rate": 1e-05, "loss": 1.4751, "step": 430 }, { "epoch": 0.08, "grad_norm": 2.862971782684326, "learning_rate": 1e-05, "loss": 1.4609, "step": 440 }, { "epoch": 0.08, "grad_norm": 3.7004852294921875, "learning_rate": 1e-05, "loss": 1.4173, "step": 450 }, { "epoch": 0.08, "grad_norm": 1.815732479095459, "learning_rate": 1e-05, "loss": 1.3416, "step": 460 }, { "epoch": 0.08, "grad_norm": 2.0111870765686035, "learning_rate": 1e-05, "loss": 1.2098, "step": 470 }, { "epoch": 0.08, "grad_norm": 2.208897590637207, "learning_rate": 1e-05, "loss": 1.4199, "step": 480 }, { "epoch": 0.08, "grad_norm": 2.9113574028015137, "learning_rate": 1e-05, "loss": 1.392, "step": 490 }, { "epoch": 0.09, "grad_norm": 1.8567959070205688, "learning_rate": 1e-05, "loss": 1.2849, "step": 500 }, { "epoch": 0.09, "grad_norm": 2.282118320465088, "learning_rate": 1e-05, "loss": 1.3741, "step": 510 }, { "epoch": 0.09, "grad_norm": 3.6298553943634033, "learning_rate": 1e-05, "loss": 1.3421, "step": 520 }, { "epoch": 0.09, "grad_norm": 2.0505871772766113, "learning_rate": 1e-05, "loss": 1.308, "step": 530 }, { "epoch": 0.09, "grad_norm": 2.0846567153930664, "learning_rate": 1e-05, "loss": 1.4474, "step": 540 }, { "epoch": 0.09, "grad_norm": 2.3861382007598877, "learning_rate": 1e-05, "loss": 1.2142, "step": 550 }, { "epoch": 0.1, "grad_norm": 2.2822299003601074, "learning_rate": 1e-05, "loss": 1.2969, "step": 560 }, { "epoch": 0.1, "grad_norm": 3.0871715545654297, "learning_rate": 1e-05, "loss": 1.461, "step": 570 }, { "epoch": 0.1, "grad_norm": 2.1369597911834717, "learning_rate": 1e-05, "loss": 1.3103, "step": 580 }, { "epoch": 0.1, "grad_norm": 2.3976328372955322, "learning_rate": 1e-05, "loss": 1.3737, "step": 590 }, { "epoch": 0.1, "grad_norm": 2.572042465209961, "learning_rate": 1e-05, "loss": 1.2747, "step": 600 }, { "epoch": 0.1, "grad_norm": 2.673109531402588, "learning_rate": 1e-05, "loss": 1.4067, "step": 610 }, { "epoch": 0.11, "grad_norm": 2.054105520248413, "learning_rate": 1e-05, "loss": 1.3836, "step": 620 }, { "epoch": 0.11, "grad_norm": 3.0587263107299805, "learning_rate": 1e-05, "loss": 1.3234, "step": 630 }, { "epoch": 0.11, "grad_norm": 1.9629148244857788, "learning_rate": 1e-05, "loss": 1.3481, "step": 640 }, { "epoch": 0.11, "grad_norm": 1.9313011169433594, "learning_rate": 1e-05, "loss": 1.334, "step": 650 }, { "epoch": 0.11, "grad_norm": 2.0011725425720215, "learning_rate": 1e-05, "loss": 1.3259, "step": 660 }, { "epoch": 0.12, "grad_norm": 1.842392086982727, "learning_rate": 1e-05, "loss": 1.2759, "step": 670 }, { "epoch": 0.12, "grad_norm": 1.944788932800293, "learning_rate": 1e-05, "loss": 1.3423, "step": 680 }, { "epoch": 0.12, "grad_norm": 2.603177785873413, "learning_rate": 1e-05, "loss": 1.2942, "step": 690 }, { "epoch": 0.12, "grad_norm": 2.1123621463775635, "learning_rate": 1e-05, "loss": 1.2612, "step": 700 }, { "epoch": 0.12, "grad_norm": 4.298437118530273, "learning_rate": 1e-05, "loss": 1.3443, "step": 710 }, { "epoch": 0.12, "grad_norm": 2.184427261352539, "learning_rate": 1e-05, "loss": 1.2893, "step": 720 }, { "epoch": 0.13, "grad_norm": 3.9805634021759033, "learning_rate": 1e-05, "loss": 1.3534, "step": 730 }, { "epoch": 0.13, "grad_norm": 2.200209140777588, "learning_rate": 1e-05, "loss": 1.5907, "step": 740 }, { "epoch": 0.13, "grad_norm": 3.0145490169525146, "learning_rate": 1e-05, "loss": 1.3638, "step": 750 }, { "epoch": 0.13, "grad_norm": 3.1146225929260254, "learning_rate": 1e-05, "loss": 1.4594, "step": 760 }, { "epoch": 0.13, "grad_norm": 1.9472726583480835, "learning_rate": 1e-05, "loss": 1.4046, "step": 770 }, { "epoch": 0.13, "grad_norm": 2.558044672012329, "learning_rate": 1e-05, "loss": 1.3187, "step": 780 }, { "epoch": 0.14, "grad_norm": 2.1615829467773438, "learning_rate": 1e-05, "loss": 1.3915, "step": 790 }, { "epoch": 0.14, "grad_norm": 2.4719650745391846, "learning_rate": 1e-05, "loss": 1.2974, "step": 800 }, { "epoch": 0.14, "grad_norm": 2.8266043663024902, "learning_rate": 1e-05, "loss": 1.4546, "step": 810 }, { "epoch": 0.14, "grad_norm": 1.942686676979065, "learning_rate": 1e-05, "loss": 1.2597, "step": 820 }, { "epoch": 0.14, "grad_norm": 2.5434083938598633, "learning_rate": 1e-05, "loss": 1.3298, "step": 830 }, { "epoch": 0.14, "grad_norm": 2.2210099697113037, "learning_rate": 1e-05, "loss": 1.3343, "step": 840 }, { "epoch": 0.15, "grad_norm": 2.09978985786438, "learning_rate": 1e-05, "loss": 1.2395, "step": 850 }, { "epoch": 0.15, "grad_norm": 1.9050910472869873, "learning_rate": 1e-05, "loss": 1.3362, "step": 860 }, { "epoch": 0.15, "grad_norm": 2.2624146938323975, "learning_rate": 1e-05, "loss": 1.3459, "step": 870 }, { "epoch": 0.15, "grad_norm": 2.076939582824707, "learning_rate": 1e-05, "loss": 1.2785, "step": 880 }, { "epoch": 0.15, "grad_norm": 2.847289562225342, "learning_rate": 1e-05, "loss": 1.3979, "step": 890 }, { "epoch": 0.15, "grad_norm": 2.6758952140808105, "learning_rate": 1e-05, "loss": 1.3496, "step": 900 }, { "epoch": 0.16, "grad_norm": 2.7525131702423096, "learning_rate": 1e-05, "loss": 1.412, "step": 910 }, { "epoch": 0.16, "grad_norm": 2.080402135848999, "learning_rate": 1e-05, "loss": 1.4179, "step": 920 }, { "epoch": 0.16, "grad_norm": 2.6619815826416016, "learning_rate": 1e-05, "loss": 1.3204, "step": 930 }, { "epoch": 0.16, "grad_norm": 2.1060123443603516, "learning_rate": 1e-05, "loss": 1.3582, "step": 940 }, { "epoch": 0.16, "grad_norm": 2.734252452850342, "learning_rate": 1e-05, "loss": 1.3209, "step": 950 }, { "epoch": 0.16, "grad_norm": 2.7691597938537598, "learning_rate": 1e-05, "loss": 1.3503, "step": 960 }, { "epoch": 0.17, "grad_norm": 2.4318649768829346, "learning_rate": 1e-05, "loss": 1.2655, "step": 970 }, { "epoch": 0.17, "grad_norm": 2.137103319168091, "learning_rate": 1e-05, "loss": 1.4927, "step": 980 }, { "epoch": 0.17, "grad_norm": 2.2520899772644043, "learning_rate": 1e-05, "loss": 1.3405, "step": 990 }, { "epoch": 0.17, "grad_norm": 2.5094006061553955, "learning_rate": 1e-05, "loss": 1.4049, "step": 1000 }, { "epoch": 0.17, "grad_norm": 2.372251033782959, "learning_rate": 1e-05, "loss": 1.3484, "step": 1010 }, { "epoch": 0.18, "grad_norm": 2.1078174114227295, "learning_rate": 1e-05, "loss": 1.3468, "step": 1020 }, { "epoch": 0.18, "grad_norm": 2.324495315551758, "learning_rate": 1e-05, "loss": 1.3558, "step": 1030 }, { "epoch": 0.18, "grad_norm": 3.389418363571167, "learning_rate": 1e-05, "loss": 1.3081, "step": 1040 }, { "epoch": 0.18, "grad_norm": 2.6623008251190186, "learning_rate": 1e-05, "loss": 1.3211, "step": 1050 }, { "epoch": 0.18, "grad_norm": 1.8556021451950073, "learning_rate": 1e-05, "loss": 1.3689, "step": 1060 }, { "epoch": 0.18, "grad_norm": 1.9243156909942627, "learning_rate": 1e-05, "loss": 1.2978, "step": 1070 }, { "epoch": 0.19, "grad_norm": 2.082162618637085, "learning_rate": 1e-05, "loss": 1.3405, "step": 1080 }, { "epoch": 0.19, "grad_norm": 2.7506792545318604, "learning_rate": 1e-05, "loss": 1.3986, "step": 1090 }, { "epoch": 0.19, "grad_norm": 3.331587791442871, "learning_rate": 1e-05, "loss": 1.3117, "step": 1100 }, { "epoch": 0.19, "grad_norm": 2.870234727859497, "learning_rate": 1e-05, "loss": 1.4165, "step": 1110 }, { "epoch": 0.19, "grad_norm": 2.293771743774414, "learning_rate": 1e-05, "loss": 1.3671, "step": 1120 }, { "epoch": 0.19, "grad_norm": 3.1112711429595947, "learning_rate": 1e-05, "loss": 1.4002, "step": 1130 }, { "epoch": 0.2, "grad_norm": 3.0179507732391357, "learning_rate": 1e-05, "loss": 1.3784, "step": 1140 }, { "epoch": 0.2, "grad_norm": 1.8166738748550415, "learning_rate": 1e-05, "loss": 1.3889, "step": 1150 }, { "epoch": 0.2, "grad_norm": 2.134061574935913, "learning_rate": 1e-05, "loss": 1.3192, "step": 1160 }, { "epoch": 0.2, "grad_norm": 2.651945114135742, "learning_rate": 1e-05, "loss": 1.3642, "step": 1170 }, { "epoch": 0.2, "grad_norm": 2.9652693271636963, "learning_rate": 1e-05, "loss": 1.356, "step": 1180 }, { "epoch": 0.2, "grad_norm": 2.771118402481079, "learning_rate": 1e-05, "loss": 1.3345, "step": 1190 }, { "epoch": 0.21, "grad_norm": 2.6212990283966064, "learning_rate": 1e-05, "loss": 1.3111, "step": 1200 }, { "epoch": 0.21, "grad_norm": 2.779235363006592, "learning_rate": 1e-05, "loss": 1.3837, "step": 1210 }, { "epoch": 0.21, "grad_norm": 2.5824780464172363, "learning_rate": 1e-05, "loss": 1.2821, "step": 1220 }, { "epoch": 0.21, "grad_norm": 2.7208638191223145, "learning_rate": 1e-05, "loss": 1.3874, "step": 1230 }, { "epoch": 0.21, "grad_norm": 3.4320294857025146, "learning_rate": 1e-05, "loss": 1.317, "step": 1240 }, { "epoch": 0.21, "grad_norm": 2.185110330581665, "learning_rate": 1e-05, "loss": 1.3984, "step": 1250 }, { "epoch": 0.22, "grad_norm": 2.3204574584960938, "learning_rate": 1e-05, "loss": 1.3539, "step": 1260 }, { "epoch": 0.22, "grad_norm": 2.0239412784576416, "learning_rate": 1e-05, "loss": 1.3766, "step": 1270 }, { "epoch": 0.22, "grad_norm": 2.068580150604248, "learning_rate": 1e-05, "loss": 1.3301, "step": 1280 }, { "epoch": 0.22, "grad_norm": 2.349250555038452, "learning_rate": 1e-05, "loss": 1.2955, "step": 1290 }, { "epoch": 0.22, "grad_norm": 2.8978281021118164, "learning_rate": 1e-05, "loss": 1.4535, "step": 1300 }, { "epoch": 0.22, "grad_norm": 2.288252353668213, "learning_rate": 1e-05, "loss": 1.2881, "step": 1310 }, { "epoch": 0.23, "grad_norm": 1.632664442062378, "learning_rate": 1e-05, "loss": 1.3044, "step": 1320 }, { "epoch": 0.23, "grad_norm": 2.277397394180298, "learning_rate": 1e-05, "loss": 1.2918, "step": 1330 }, { "epoch": 0.23, "grad_norm": 2.837981700897217, "learning_rate": 1e-05, "loss": 1.385, "step": 1340 }, { "epoch": 0.23, "grad_norm": 2.083287000656128, "learning_rate": 1e-05, "loss": 1.2665, "step": 1350 }, { "epoch": 0.23, "grad_norm": 2.0568087100982666, "learning_rate": 1e-05, "loss": 1.1869, "step": 1360 }, { "epoch": 0.24, "grad_norm": 2.4070591926574707, "learning_rate": 1e-05, "loss": 1.3262, "step": 1370 }, { "epoch": 0.24, "grad_norm": 2.0220119953155518, "learning_rate": 1e-05, "loss": 1.3278, "step": 1380 }, { "epoch": 0.24, "grad_norm": 2.2559282779693604, "learning_rate": 1e-05, "loss": 1.3381, "step": 1390 }, { "epoch": 0.24, "grad_norm": 2.258565902709961, "learning_rate": 1e-05, "loss": 1.3113, "step": 1400 }, { "epoch": 0.24, "grad_norm": 2.2906646728515625, "learning_rate": 1e-05, "loss": 1.2238, "step": 1410 }, { "epoch": 0.24, "grad_norm": 2.1853528022766113, "learning_rate": 1e-05, "loss": 1.3785, "step": 1420 }, { "epoch": 0.25, "grad_norm": 3.12762188911438, "learning_rate": 1e-05, "loss": 1.4494, "step": 1430 }, { "epoch": 0.25, "grad_norm": 1.401248574256897, "learning_rate": 1e-05, "loss": 1.2383, "step": 1440 }, { "epoch": 0.25, "grad_norm": 2.6352481842041016, "learning_rate": 1e-05, "loss": 1.3514, "step": 1450 }, { "epoch": 0.25, "grad_norm": 2.0247933864593506, "learning_rate": 1e-05, "loss": 1.2866, "step": 1460 }, { "epoch": 0.25, "grad_norm": 2.5140461921691895, "learning_rate": 1e-05, "loss": 1.3917, "step": 1470 }, { "epoch": 0.25, "grad_norm": 2.628725051879883, "learning_rate": 1e-05, "loss": 1.3509, "step": 1480 }, { "epoch": 0.26, "grad_norm": 1.9856632947921753, "learning_rate": 1e-05, "loss": 1.3269, "step": 1490 }, { "epoch": 0.26, "grad_norm": 1.7394587993621826, "learning_rate": 1e-05, "loss": 1.3407, "step": 1500 }, { "epoch": 0.26, "grad_norm": 2.4537129402160645, "learning_rate": 1e-05, "loss": 1.3306, "step": 1510 }, { "epoch": 0.26, "grad_norm": 2.5386736392974854, "learning_rate": 1e-05, "loss": 1.3187, "step": 1520 }, { "epoch": 0.26, "grad_norm": 3.3441104888916016, "learning_rate": 1e-05, "loss": 1.4401, "step": 1530 }, { "epoch": 0.26, "grad_norm": 2.6362714767456055, "learning_rate": 1e-05, "loss": 1.337, "step": 1540 }, { "epoch": 0.27, "grad_norm": 1.830051064491272, "learning_rate": 1e-05, "loss": 1.3022, "step": 1550 }, { "epoch": 0.27, "grad_norm": 2.8917646408081055, "learning_rate": 1e-05, "loss": 1.352, "step": 1560 }, { "epoch": 0.27, "grad_norm": 2.4951720237731934, "learning_rate": 1e-05, "loss": 1.35, "step": 1570 }, { "epoch": 0.27, "grad_norm": 1.9048365354537964, "learning_rate": 1e-05, "loss": 1.4491, "step": 1580 }, { "epoch": 0.27, "grad_norm": 1.7537648677825928, "learning_rate": 1e-05, "loss": 1.2354, "step": 1590 }, { "epoch": 0.27, "grad_norm": 2.120579719543457, "learning_rate": 1e-05, "loss": 1.3471, "step": 1600 }, { "epoch": 0.28, "grad_norm": 2.85201096534729, "learning_rate": 1e-05, "loss": 1.2367, "step": 1610 }, { "epoch": 0.28, "grad_norm": 1.9622502326965332, "learning_rate": 1e-05, "loss": 1.305, "step": 1620 }, { "epoch": 0.28, "grad_norm": 1.735097050666809, "learning_rate": 1e-05, "loss": 1.4083, "step": 1630 }, { "epoch": 0.28, "grad_norm": 2.6957337856292725, "learning_rate": 1e-05, "loss": 1.29, "step": 1640 }, { "epoch": 0.28, "grad_norm": 2.07291841506958, "learning_rate": 1e-05, "loss": 1.3921, "step": 1650 }, { "epoch": 0.29, "grad_norm": 2.0542261600494385, "learning_rate": 1e-05, "loss": 1.3678, "step": 1660 }, { "epoch": 0.29, "grad_norm": 2.107818126678467, "learning_rate": 1e-05, "loss": 1.2903, "step": 1670 }, { "epoch": 0.29, "grad_norm": 2.1922590732574463, "learning_rate": 1e-05, "loss": 1.2826, "step": 1680 }, { "epoch": 0.29, "grad_norm": 2.963850736618042, "learning_rate": 1e-05, "loss": 1.3797, "step": 1690 }, { "epoch": 0.29, "grad_norm": 2.5969626903533936, "learning_rate": 1e-05, "loss": 1.3398, "step": 1700 }, { "epoch": 0.29, "grad_norm": 2.1566317081451416, "learning_rate": 1e-05, "loss": 1.245, "step": 1710 }, { "epoch": 0.3, "grad_norm": 2.023244619369507, "learning_rate": 1e-05, "loss": 1.3088, "step": 1720 }, { "epoch": 0.3, "grad_norm": 2.4707469940185547, "learning_rate": 1e-05, "loss": 1.2738, "step": 1730 }, { "epoch": 0.3, "grad_norm": 1.61325204372406, "learning_rate": 1e-05, "loss": 1.3641, "step": 1740 }, { "epoch": 0.3, "grad_norm": 2.0144903659820557, "learning_rate": 1e-05, "loss": 1.3668, "step": 1750 }, { "epoch": 0.3, "grad_norm": 1.6015106439590454, "learning_rate": 1e-05, "loss": 1.4913, "step": 1760 }, { "epoch": 0.3, "grad_norm": 2.9650461673736572, "learning_rate": 1e-05, "loss": 1.4019, "step": 1770 }, { "epoch": 0.31, "grad_norm": 2.607697010040283, "learning_rate": 1e-05, "loss": 1.2866, "step": 1780 }, { "epoch": 0.31, "grad_norm": 2.2201220989227295, "learning_rate": 1e-05, "loss": 1.3585, "step": 1790 }, { "epoch": 0.31, "grad_norm": 3.4735159873962402, "learning_rate": 1e-05, "loss": 1.4571, "step": 1800 }, { "epoch": 0.31, "grad_norm": 2.2776782512664795, "learning_rate": 1e-05, "loss": 1.3219, "step": 1810 }, { "epoch": 0.31, "grad_norm": 2.329009532928467, "learning_rate": 1e-05, "loss": 1.385, "step": 1820 }, { "epoch": 0.31, "grad_norm": 2.494612216949463, "learning_rate": 1e-05, "loss": 1.2941, "step": 1830 }, { "epoch": 0.32, "grad_norm": 4.000184059143066, "learning_rate": 1e-05, "loss": 1.3872, "step": 1840 }, { "epoch": 0.32, "grad_norm": 2.1177732944488525, "learning_rate": 1e-05, "loss": 1.414, "step": 1850 }, { "epoch": 0.32, "grad_norm": 3.023552894592285, "learning_rate": 1e-05, "loss": 1.3941, "step": 1860 }, { "epoch": 0.32, "grad_norm": 2.5864198207855225, "learning_rate": 1e-05, "loss": 1.345, "step": 1870 }, { "epoch": 0.32, "grad_norm": 2.5800952911376953, "learning_rate": 1e-05, "loss": 1.3384, "step": 1880 }, { "epoch": 0.32, "grad_norm": 2.223054885864258, "learning_rate": 1e-05, "loss": 1.4299, "step": 1890 }, { "epoch": 0.33, "grad_norm": 2.060757875442505, "learning_rate": 1e-05, "loss": 1.3994, "step": 1900 }, { "epoch": 0.33, "grad_norm": 2.429835557937622, "learning_rate": 1e-05, "loss": 1.3322, "step": 1910 }, { "epoch": 0.33, "grad_norm": 1.8696309328079224, "learning_rate": 1e-05, "loss": 1.25, "step": 1920 }, { "epoch": 0.33, "grad_norm": 1.8844784498214722, "learning_rate": 1e-05, "loss": 1.2992, "step": 1930 }, { "epoch": 0.33, "grad_norm": 2.0955729484558105, "learning_rate": 1e-05, "loss": 1.339, "step": 1940 }, { "epoch": 0.33, "grad_norm": 1.8389264345169067, "learning_rate": 1e-05, "loss": 1.3021, "step": 1950 }, { "epoch": 0.34, "grad_norm": 2.6430881023406982, "learning_rate": 1e-05, "loss": 1.3589, "step": 1960 }, { "epoch": 0.34, "grad_norm": 2.4001126289367676, "learning_rate": 1e-05, "loss": 1.286, "step": 1970 }, { "epoch": 0.34, "grad_norm": 2.283057928085327, "learning_rate": 1e-05, "loss": 1.3626, "step": 1980 }, { "epoch": 0.34, "grad_norm": 2.0561628341674805, "learning_rate": 1e-05, "loss": 1.4019, "step": 1990 }, { "epoch": 0.34, "grad_norm": 2.2793378829956055, "learning_rate": 1e-05, "loss": 1.354, "step": 2000 }, { "epoch": 0.35, "grad_norm": 2.4692368507385254, "learning_rate": 1e-05, "loss": 1.3677, "step": 2010 }, { "epoch": 0.35, "grad_norm": 1.9414951801300049, "learning_rate": 1e-05, "loss": 1.2852, "step": 2020 }, { "epoch": 0.35, "grad_norm": 2.1738226413726807, "learning_rate": 1e-05, "loss": 1.2924, "step": 2030 }, { "epoch": 0.35, "grad_norm": 2.1066489219665527, "learning_rate": 1e-05, "loss": 1.304, "step": 2040 }, { "epoch": 0.35, "grad_norm": 4.439203262329102, "learning_rate": 1e-05, "loss": 1.3377, "step": 2050 }, { "epoch": 0.35, "grad_norm": 2.1386377811431885, "learning_rate": 1e-05, "loss": 1.3543, "step": 2060 }, { "epoch": 0.36, "grad_norm": 2.179619073867798, "learning_rate": 1e-05, "loss": 1.4117, "step": 2070 }, { "epoch": 0.36, "grad_norm": 2.7357661724090576, "learning_rate": 1e-05, "loss": 1.4537, "step": 2080 }, { "epoch": 0.36, "grad_norm": 1.8351653814315796, "learning_rate": 1e-05, "loss": 1.2952, "step": 2090 }, { "epoch": 0.36, "grad_norm": 2.759627342224121, "learning_rate": 1e-05, "loss": 1.4223, "step": 2100 }, { "epoch": 0.36, "grad_norm": 2.220985174179077, "learning_rate": 1e-05, "loss": 1.3881, "step": 2110 }, { "epoch": 0.36, "grad_norm": 2.878201484680176, "learning_rate": 1e-05, "loss": 1.3814, "step": 2120 }, { "epoch": 0.37, "grad_norm": 2.227640390396118, "learning_rate": 1e-05, "loss": 1.315, "step": 2130 }, { "epoch": 0.37, "grad_norm": 1.974422574043274, "learning_rate": 1e-05, "loss": 1.2577, "step": 2140 }, { "epoch": 0.37, "grad_norm": 2.6234285831451416, "learning_rate": 1e-05, "loss": 1.3741, "step": 2150 }, { "epoch": 0.37, "grad_norm": 2.4010727405548096, "learning_rate": 1e-05, "loss": 1.3185, "step": 2160 }, { "epoch": 0.37, "grad_norm": 3.83111310005188, "learning_rate": 1e-05, "loss": 1.3551, "step": 2170 }, { "epoch": 0.37, "grad_norm": 2.351682424545288, "learning_rate": 1e-05, "loss": 1.3692, "step": 2180 }, { "epoch": 0.38, "grad_norm": 2.604748487472534, "learning_rate": 1e-05, "loss": 1.2968, "step": 2190 }, { "epoch": 0.38, "grad_norm": 3.1189563274383545, "learning_rate": 1e-05, "loss": 1.3221, "step": 2200 }, { "epoch": 0.38, "grad_norm": 2.773430347442627, "learning_rate": 1e-05, "loss": 1.2218, "step": 2210 }, { "epoch": 0.38, "grad_norm": 2.6280314922332764, "learning_rate": 1e-05, "loss": 1.3572, "step": 2220 }, { "epoch": 0.38, "grad_norm": 2.858031749725342, "learning_rate": 1e-05, "loss": 1.3583, "step": 2230 }, { "epoch": 0.38, "grad_norm": 2.839733362197876, "learning_rate": 1e-05, "loss": 1.3644, "step": 2240 }, { "epoch": 0.39, "grad_norm": 1.971848726272583, "learning_rate": 1e-05, "loss": 1.2817, "step": 2250 }, { "epoch": 0.39, "grad_norm": 2.195857524871826, "learning_rate": 1e-05, "loss": 1.3027, "step": 2260 }, { "epoch": 0.39, "grad_norm": 1.9531887769699097, "learning_rate": 1e-05, "loss": 1.3062, "step": 2270 }, { "epoch": 0.39, "grad_norm": 2.1299996376037598, "learning_rate": 1e-05, "loss": 1.3014, "step": 2280 }, { "epoch": 0.39, "grad_norm": 2.4937584400177, "learning_rate": 1e-05, "loss": 1.2315, "step": 2290 }, { "epoch": 0.39, "grad_norm": 2.707040309906006, "learning_rate": 1e-05, "loss": 1.3776, "step": 2300 }, { "epoch": 0.4, "grad_norm": 2.4510104656219482, "learning_rate": 1e-05, "loss": 1.4079, "step": 2310 }, { "epoch": 0.4, "grad_norm": 2.667187452316284, "learning_rate": 1e-05, "loss": 1.3291, "step": 2320 }, { "epoch": 0.4, "grad_norm": 2.8011958599090576, "learning_rate": 1e-05, "loss": 1.305, "step": 2330 }, { "epoch": 0.4, "grad_norm": 2.067420482635498, "learning_rate": 1e-05, "loss": 1.2674, "step": 2340 }, { "epoch": 0.4, "grad_norm": 2.243347644805908, "learning_rate": 1e-05, "loss": 1.2858, "step": 2350 }, { "epoch": 0.41, "grad_norm": 2.0893757343292236, "learning_rate": 1e-05, "loss": 1.3556, "step": 2360 }, { "epoch": 0.41, "grad_norm": 2.331129550933838, "learning_rate": 1e-05, "loss": 1.3768, "step": 2370 }, { "epoch": 0.41, "grad_norm": 2.0916008949279785, "learning_rate": 1e-05, "loss": 1.2819, "step": 2380 }, { "epoch": 0.41, "grad_norm": 2.073061227798462, "learning_rate": 1e-05, "loss": 1.2555, "step": 2390 }, { "epoch": 0.41, "grad_norm": 2.3167691230773926, "learning_rate": 1e-05, "loss": 1.2621, "step": 2400 }, { "epoch": 0.41, "grad_norm": 2.0317132472991943, "learning_rate": 1e-05, "loss": 1.4108, "step": 2410 }, { "epoch": 0.42, "grad_norm": 2.494563579559326, "learning_rate": 1e-05, "loss": 1.2998, "step": 2420 }, { "epoch": 0.42, "grad_norm": 2.281918525695801, "learning_rate": 1e-05, "loss": 1.4285, "step": 2430 }, { "epoch": 0.42, "grad_norm": 3.9818170070648193, "learning_rate": 1e-05, "loss": 1.3844, "step": 2440 }, { "epoch": 0.42, "grad_norm": 2.736210823059082, "learning_rate": 1e-05, "loss": 1.3575, "step": 2450 }, { "epoch": 0.42, "grad_norm": 2.2397067546844482, "learning_rate": 1e-05, "loss": 1.2842, "step": 2460 }, { "epoch": 0.42, "grad_norm": 3.424915313720703, "learning_rate": 1e-05, "loss": 1.25, "step": 2470 }, { "epoch": 0.43, "grad_norm": 2.1230766773223877, "learning_rate": 1e-05, "loss": 1.3246, "step": 2480 }, { "epoch": 0.43, "grad_norm": 2.0383758544921875, "learning_rate": 1e-05, "loss": 1.2224, "step": 2490 }, { "epoch": 0.43, "grad_norm": 3.118562698364258, "learning_rate": 1e-05, "loss": 1.4465, "step": 2500 }, { "epoch": 0.43, "grad_norm": 2.747936964035034, "learning_rate": 1e-05, "loss": 1.3906, "step": 2510 }, { "epoch": 0.43, "grad_norm": 3.0030086040496826, "learning_rate": 1e-05, "loss": 1.3728, "step": 2520 }, { "epoch": 0.43, "grad_norm": 2.7183146476745605, "learning_rate": 1e-05, "loss": 1.3141, "step": 2530 }, { "epoch": 0.44, "grad_norm": 1.8749139308929443, "learning_rate": 1e-05, "loss": 1.3219, "step": 2540 }, { "epoch": 0.44, "grad_norm": 1.784529685974121, "learning_rate": 1e-05, "loss": 1.4004, "step": 2550 }, { "epoch": 0.44, "grad_norm": 2.992795944213867, "learning_rate": 1e-05, "loss": 1.2798, "step": 2560 }, { "epoch": 0.44, "grad_norm": 2.0557336807250977, "learning_rate": 1e-05, "loss": 1.1834, "step": 2570 }, { "epoch": 0.44, "grad_norm": 2.8129782676696777, "learning_rate": 1e-05, "loss": 1.4837, "step": 2580 }, { "epoch": 0.44, "grad_norm": 2.194706439971924, "learning_rate": 1e-05, "loss": 1.3455, "step": 2590 }, { "epoch": 0.45, "grad_norm": 2.341437816619873, "learning_rate": 1e-05, "loss": 1.1929, "step": 2600 }, { "epoch": 0.45, "grad_norm": 2.618860960006714, "learning_rate": 1e-05, "loss": 1.3953, "step": 2610 }, { "epoch": 0.45, "grad_norm": 3.1975460052490234, "learning_rate": 1e-05, "loss": 1.3278, "step": 2620 }, { "epoch": 0.45, "grad_norm": 2.3264036178588867, "learning_rate": 1e-05, "loss": 1.2772, "step": 2630 }, { "epoch": 0.45, "grad_norm": 3.271860361099243, "learning_rate": 1e-05, "loss": 1.2736, "step": 2640 }, { "epoch": 0.46, "grad_norm": 2.292814016342163, "learning_rate": 1e-05, "loss": 1.2622, "step": 2650 }, { "epoch": 0.46, "grad_norm": 2.3970437049865723, "learning_rate": 1e-05, "loss": 1.399, "step": 2660 }, { "epoch": 0.46, "grad_norm": 2.5156729221343994, "learning_rate": 1e-05, "loss": 1.2094, "step": 2670 }, { "epoch": 0.46, "grad_norm": 2.0000627040863037, "learning_rate": 1e-05, "loss": 1.2816, "step": 2680 }, { "epoch": 0.46, "grad_norm": 2.0072364807128906, "learning_rate": 1e-05, "loss": 1.303, "step": 2690 }, { "epoch": 0.46, "grad_norm": 2.7943062782287598, "learning_rate": 1e-05, "loss": 1.3585, "step": 2700 }, { "epoch": 0.47, "grad_norm": 2.5227713584899902, "learning_rate": 1e-05, "loss": 1.2586, "step": 2710 }, { "epoch": 0.47, "grad_norm": 6.766931533813477, "learning_rate": 1e-05, "loss": 1.4304, "step": 2720 }, { "epoch": 0.47, "grad_norm": 2.1842198371887207, "learning_rate": 1e-05, "loss": 1.2814, "step": 2730 }, { "epoch": 0.47, "grad_norm": 2.2196881771087646, "learning_rate": 1e-05, "loss": 1.2937, "step": 2740 }, { "epoch": 0.47, "grad_norm": 2.007570743560791, "learning_rate": 1e-05, "loss": 1.378, "step": 2750 }, { "epoch": 0.47, "grad_norm": 2.672393798828125, "learning_rate": 1e-05, "loss": 1.3915, "step": 2760 }, { "epoch": 0.48, "grad_norm": 2.169638156890869, "learning_rate": 1e-05, "loss": 1.3554, "step": 2770 }, { "epoch": 0.48, "grad_norm": 2.4125783443450928, "learning_rate": 1e-05, "loss": 1.3345, "step": 2780 }, { "epoch": 0.48, "grad_norm": 2.443183660507202, "learning_rate": 1e-05, "loss": 1.3754, "step": 2790 }, { "epoch": 0.48, "grad_norm": 3.338090419769287, "learning_rate": 1e-05, "loss": 1.3564, "step": 2800 }, { "epoch": 0.48, "grad_norm": 2.6799206733703613, "learning_rate": 1e-05, "loss": 1.4033, "step": 2810 }, { "epoch": 0.48, "grad_norm": 2.752776622772217, "learning_rate": 1e-05, "loss": 1.3011, "step": 2820 }, { "epoch": 0.49, "grad_norm": 2.800459384918213, "learning_rate": 1e-05, "loss": 1.3509, "step": 2830 }, { "epoch": 0.49, "grad_norm": 2.97355318069458, "learning_rate": 1e-05, "loss": 1.478, "step": 2840 }, { "epoch": 0.49, "grad_norm": 3.2402892112731934, "learning_rate": 1e-05, "loss": 1.2713, "step": 2850 }, { "epoch": 0.49, "grad_norm": 2.3085737228393555, "learning_rate": 1e-05, "loss": 1.3607, "step": 2860 }, { "epoch": 0.49, "grad_norm": 2.410074234008789, "learning_rate": 1e-05, "loss": 1.3084, "step": 2870 }, { "epoch": 0.49, "grad_norm": 2.0424160957336426, "learning_rate": 1e-05, "loss": 1.325, "step": 2880 }, { "epoch": 0.5, "grad_norm": 3.200083017349243, "learning_rate": 1e-05, "loss": 1.3144, "step": 2890 }, { "epoch": 0.5, "grad_norm": 2.7350451946258545, "learning_rate": 1e-05, "loss": 1.3447, "step": 2900 }, { "epoch": 0.5, "grad_norm": 3.373868703842163, "learning_rate": 1e-05, "loss": 1.3489, "step": 2910 }, { "epoch": 0.5, "grad_norm": 2.9406344890594482, "learning_rate": 1e-05, "loss": 1.349, "step": 2920 }, { "epoch": 0.5, "grad_norm": 2.075248956680298, "learning_rate": 1e-05, "loss": 1.2307, "step": 2930 }, { "epoch": 0.5, "grad_norm": 2.607004404067993, "learning_rate": 1e-05, "loss": 1.3246, "step": 2940 }, { "epoch": 0.51, "grad_norm": 3.0145184993743896, "learning_rate": 1e-05, "loss": 1.3412, "step": 2950 }, { "epoch": 0.51, "grad_norm": 2.537825345993042, "learning_rate": 1e-05, "loss": 1.3499, "step": 2960 }, { "epoch": 0.51, "grad_norm": 2.785065174102783, "learning_rate": 1e-05, "loss": 1.325, "step": 2970 }, { "epoch": 0.51, "grad_norm": 2.4221601486206055, "learning_rate": 1e-05, "loss": 1.3289, "step": 2980 }, { "epoch": 0.51, "grad_norm": 2.5729472637176514, "learning_rate": 1e-05, "loss": 1.3444, "step": 2990 }, { "epoch": 0.52, "grad_norm": 2.2976810932159424, "learning_rate": 1e-05, "loss": 1.2523, "step": 3000 }, { "epoch": 0.52, "grad_norm": 2.300956964492798, "learning_rate": 1e-05, "loss": 1.3421, "step": 3010 }, { "epoch": 0.52, "grad_norm": 2.143294334411621, "learning_rate": 1e-05, "loss": 1.3209, "step": 3020 }, { "epoch": 0.52, "grad_norm": 1.957260251045227, "learning_rate": 1e-05, "loss": 1.3478, "step": 3030 }, { "epoch": 0.52, "grad_norm": 2.4198296070098877, "learning_rate": 1e-05, "loss": 1.39, "step": 3040 }, { "epoch": 0.52, "grad_norm": 2.0126090049743652, "learning_rate": 1e-05, "loss": 1.3365, "step": 3050 }, { "epoch": 0.53, "grad_norm": 1.9037426710128784, "learning_rate": 1e-05, "loss": 1.2968, "step": 3060 }, { "epoch": 0.53, "grad_norm": 2.2140746116638184, "learning_rate": 1e-05, "loss": 1.3395, "step": 3070 }, { "epoch": 0.53, "grad_norm": 1.4906034469604492, "learning_rate": 1e-05, "loss": 1.3678, "step": 3080 }, { "epoch": 0.53, "grad_norm": 2.0024867057800293, "learning_rate": 1e-05, "loss": 1.274, "step": 3090 }, { "epoch": 0.53, "grad_norm": 2.1890878677368164, "learning_rate": 1e-05, "loss": 1.3005, "step": 3100 }, { "epoch": 0.53, "grad_norm": 2.7136616706848145, "learning_rate": 1e-05, "loss": 1.3384, "step": 3110 }, { "epoch": 0.54, "grad_norm": 2.1568996906280518, "learning_rate": 1e-05, "loss": 1.3166, "step": 3120 }, { "epoch": 0.54, "grad_norm": 1.9385143518447876, "learning_rate": 1e-05, "loss": 1.2731, "step": 3130 }, { "epoch": 0.54, "grad_norm": 2.474834680557251, "learning_rate": 1e-05, "loss": 1.3038, "step": 3140 }, { "epoch": 0.54, "grad_norm": 2.2946722507476807, "learning_rate": 1e-05, "loss": 1.2439, "step": 3150 }, { "epoch": 0.54, "grad_norm": 2.2470133304595947, "learning_rate": 1e-05, "loss": 1.3573, "step": 3160 }, { "epoch": 0.54, "grad_norm": 2.878171443939209, "learning_rate": 1e-05, "loss": 1.3902, "step": 3170 }, { "epoch": 0.55, "grad_norm": 2.551673173904419, "learning_rate": 1e-05, "loss": 1.2785, "step": 3180 }, { "epoch": 0.55, "grad_norm": 2.7930572032928467, "learning_rate": 1e-05, "loss": 1.3438, "step": 3190 }, { "epoch": 0.55, "grad_norm": 2.136298179626465, "learning_rate": 1e-05, "loss": 1.2169, "step": 3200 }, { "epoch": 0.55, "grad_norm": 2.7656638622283936, "learning_rate": 1e-05, "loss": 1.3341, "step": 3210 }, { "epoch": 0.55, "grad_norm": 1.9639734029769897, "learning_rate": 1e-05, "loss": 1.3665, "step": 3220 }, { "epoch": 0.55, "grad_norm": 2.61297345161438, "learning_rate": 1e-05, "loss": 1.3103, "step": 3230 }, { "epoch": 0.56, "grad_norm": 2.183431386947632, "learning_rate": 1e-05, "loss": 1.3381, "step": 3240 }, { "epoch": 0.56, "grad_norm": 3.3795530796051025, "learning_rate": 1e-05, "loss": 1.417, "step": 3250 }, { "epoch": 0.56, "grad_norm": 1.683171033859253, "learning_rate": 1e-05, "loss": 1.3867, "step": 3260 }, { "epoch": 0.56, "grad_norm": 2.4533402919769287, "learning_rate": 1e-05, "loss": 1.2866, "step": 3270 }, { "epoch": 0.56, "grad_norm": 3.316845417022705, "learning_rate": 1e-05, "loss": 1.3988, "step": 3280 }, { "epoch": 0.57, "grad_norm": 2.9724698066711426, "learning_rate": 1e-05, "loss": 1.3656, "step": 3290 }, { "epoch": 0.57, "grad_norm": 2.1463451385498047, "learning_rate": 1e-05, "loss": 1.3868, "step": 3300 }, { "epoch": 0.57, "grad_norm": 2.425827741622925, "learning_rate": 1e-05, "loss": 1.4846, "step": 3310 }, { "epoch": 0.57, "grad_norm": 2.0959482192993164, "learning_rate": 1e-05, "loss": 1.3961, "step": 3320 }, { "epoch": 0.57, "grad_norm": 5.785223007202148, "learning_rate": 1e-05, "loss": 1.3817, "step": 3330 }, { "epoch": 0.57, "grad_norm": 2.5019736289978027, "learning_rate": 1e-05, "loss": 1.3205, "step": 3340 }, { "epoch": 0.58, "grad_norm": 2.9260520935058594, "learning_rate": 1e-05, "loss": 1.2854, "step": 3350 }, { "epoch": 0.58, "grad_norm": 2.714916229248047, "learning_rate": 1e-05, "loss": 1.3533, "step": 3360 }, { "epoch": 0.58, "grad_norm": 3.024867534637451, "learning_rate": 1e-05, "loss": 1.3566, "step": 3370 }, { "epoch": 0.58, "grad_norm": 2.382457971572876, "learning_rate": 1e-05, "loss": 1.276, "step": 3380 }, { "epoch": 0.58, "grad_norm": 2.94223690032959, "learning_rate": 1e-05, "loss": 1.3663, "step": 3390 }, { "epoch": 0.58, "grad_norm": 2.0350470542907715, "learning_rate": 1e-05, "loss": 1.2721, "step": 3400 }, { "epoch": 0.59, "grad_norm": 2.4905200004577637, "learning_rate": 1e-05, "loss": 1.2781, "step": 3410 }, { "epoch": 0.59, "grad_norm": 2.03529953956604, "learning_rate": 1e-05, "loss": 1.3776, "step": 3420 }, { "epoch": 0.59, "grad_norm": 2.4352126121520996, "learning_rate": 1e-05, "loss": 1.2821, "step": 3430 }, { "epoch": 0.59, "grad_norm": 2.3159918785095215, "learning_rate": 1e-05, "loss": 1.3652, "step": 3440 }, { "epoch": 0.59, "grad_norm": 2.64506459236145, "learning_rate": 1e-05, "loss": 1.2642, "step": 3450 }, { "epoch": 0.59, "grad_norm": 1.939044713973999, "learning_rate": 1e-05, "loss": 1.3028, "step": 3460 }, { "epoch": 0.6, "grad_norm": 1.9863433837890625, "learning_rate": 1e-05, "loss": 1.4451, "step": 3470 }, { "epoch": 0.6, "grad_norm": 2.6311700344085693, "learning_rate": 1e-05, "loss": 1.3471, "step": 3480 }, { "epoch": 0.6, "grad_norm": 2.2302627563476562, "learning_rate": 1e-05, "loss": 1.434, "step": 3490 }, { "epoch": 0.6, "grad_norm": 2.555927276611328, "learning_rate": 1e-05, "loss": 1.2639, "step": 3500 }, { "epoch": 0.6, "grad_norm": 1.8889572620391846, "learning_rate": 1e-05, "loss": 1.3316, "step": 3510 }, { "epoch": 0.6, "grad_norm": 1.86160409450531, "learning_rate": 1e-05, "loss": 1.2543, "step": 3520 }, { "epoch": 0.61, "grad_norm": 1.794211506843567, "learning_rate": 1e-05, "loss": 1.373, "step": 3530 }, { "epoch": 0.61, "grad_norm": 2.82426118850708, "learning_rate": 1e-05, "loss": 1.2373, "step": 3540 }, { "epoch": 0.61, "grad_norm": 2.1324963569641113, "learning_rate": 1e-05, "loss": 1.2348, "step": 3550 }, { "epoch": 0.61, "grad_norm": 2.985074043273926, "learning_rate": 1e-05, "loss": 1.3833, "step": 3560 }, { "epoch": 0.61, "grad_norm": 2.9578144550323486, "learning_rate": 1e-05, "loss": 1.3657, "step": 3570 }, { "epoch": 0.61, "grad_norm": 2.3407962322235107, "learning_rate": 1e-05, "loss": 1.2717, "step": 3580 }, { "epoch": 0.62, "grad_norm": 2.082918643951416, "learning_rate": 1e-05, "loss": 1.3622, "step": 3590 }, { "epoch": 0.62, "grad_norm": 1.811771273612976, "learning_rate": 1e-05, "loss": 1.3736, "step": 3600 }, { "epoch": 0.62, "grad_norm": 2.3119630813598633, "learning_rate": 1e-05, "loss": 1.339, "step": 3610 }, { "epoch": 0.62, "grad_norm": 2.398604393005371, "learning_rate": 1e-05, "loss": 1.3543, "step": 3620 }, { "epoch": 0.62, "grad_norm": 2.1124989986419678, "learning_rate": 1e-05, "loss": 1.2732, "step": 3630 }, { "epoch": 0.63, "grad_norm": 2.2756052017211914, "learning_rate": 1e-05, "loss": 1.2775, "step": 3640 }, { "epoch": 0.63, "grad_norm": 1.8524532318115234, "learning_rate": 1e-05, "loss": 1.3438, "step": 3650 }, { "epoch": 0.63, "grad_norm": 2.433467149734497, "learning_rate": 1e-05, "loss": 1.2721, "step": 3660 }, { "epoch": 0.63, "grad_norm": 2.1145050525665283, "learning_rate": 1e-05, "loss": 1.2946, "step": 3670 }, { "epoch": 0.63, "grad_norm": 2.255388021469116, "learning_rate": 1e-05, "loss": 1.3833, "step": 3680 }, { "epoch": 0.63, "grad_norm": 2.235727310180664, "learning_rate": 1e-05, "loss": 1.3377, "step": 3690 }, { "epoch": 0.64, "grad_norm": 2.267134189605713, "learning_rate": 1e-05, "loss": 1.4201, "step": 3700 }, { "epoch": 0.64, "grad_norm": 2.27997088432312, "learning_rate": 1e-05, "loss": 1.2713, "step": 3710 }, { "epoch": 0.64, "grad_norm": 2.4130237102508545, "learning_rate": 1e-05, "loss": 1.3445, "step": 3720 }, { "epoch": 0.64, "grad_norm": 2.032923460006714, "learning_rate": 1e-05, "loss": 1.3873, "step": 3730 }, { "epoch": 0.64, "grad_norm": 3.0984504222869873, "learning_rate": 1e-05, "loss": 1.4304, "step": 3740 }, { "epoch": 0.64, "grad_norm": 3.5242090225219727, "learning_rate": 1e-05, "loss": 1.2671, "step": 3750 }, { "epoch": 0.65, "grad_norm": 2.271066665649414, "learning_rate": 1e-05, "loss": 1.2898, "step": 3760 }, { "epoch": 0.65, "grad_norm": 1.8168696165084839, "learning_rate": 1e-05, "loss": 1.3366, "step": 3770 }, { "epoch": 0.65, "grad_norm": 2.690077781677246, "learning_rate": 1e-05, "loss": 1.3832, "step": 3780 }, { "epoch": 0.65, "grad_norm": 3.2696776390075684, "learning_rate": 1e-05, "loss": 1.2963, "step": 3790 }, { "epoch": 0.65, "grad_norm": 2.6965296268463135, "learning_rate": 1e-05, "loss": 1.3049, "step": 3800 }, { "epoch": 0.65, "grad_norm": 2.884469985961914, "learning_rate": 1e-05, "loss": 1.3735, "step": 3810 }, { "epoch": 0.66, "grad_norm": 4.111697673797607, "learning_rate": 1e-05, "loss": 1.2462, "step": 3820 }, { "epoch": 0.66, "grad_norm": 2.5264475345611572, "learning_rate": 1e-05, "loss": 1.2981, "step": 3830 }, { "epoch": 0.66, "grad_norm": 1.9028539657592773, "learning_rate": 1e-05, "loss": 1.3386, "step": 3840 }, { "epoch": 0.66, "grad_norm": 2.7717933654785156, "learning_rate": 1e-05, "loss": 1.3897, "step": 3850 }, { "epoch": 0.66, "grad_norm": 2.108096122741699, "learning_rate": 1e-05, "loss": 1.3349, "step": 3860 }, { "epoch": 0.66, "grad_norm": 1.7096315622329712, "learning_rate": 1e-05, "loss": 1.4314, "step": 3870 }, { "epoch": 0.67, "grad_norm": 2.125641107559204, "learning_rate": 1e-05, "loss": 1.3241, "step": 3880 }, { "epoch": 0.67, "grad_norm": 1.9365934133529663, "learning_rate": 1e-05, "loss": 1.4185, "step": 3890 }, { "epoch": 0.67, "grad_norm": 3.282160997390747, "learning_rate": 1e-05, "loss": 1.4088, "step": 3900 }, { "epoch": 0.67, "grad_norm": 1.8728342056274414, "learning_rate": 1e-05, "loss": 1.3255, "step": 3910 }, { "epoch": 0.67, "grad_norm": 2.0046260356903076, "learning_rate": 1e-05, "loss": 1.2997, "step": 3920 }, { "epoch": 0.67, "grad_norm": 2.4220221042633057, "learning_rate": 1e-05, "loss": 1.304, "step": 3930 }, { "epoch": 0.68, "grad_norm": 2.533700942993164, "learning_rate": 1e-05, "loss": 1.2687, "step": 3940 }, { "epoch": 0.68, "grad_norm": 2.124030590057373, "learning_rate": 1e-05, "loss": 1.3217, "step": 3950 }, { "epoch": 0.68, "grad_norm": 3.410753011703491, "learning_rate": 1e-05, "loss": 1.2749, "step": 3960 }, { "epoch": 0.68, "grad_norm": 2.2407126426696777, "learning_rate": 1e-05, "loss": 1.3307, "step": 3970 }, { "epoch": 0.68, "grad_norm": 2.7697319984436035, "learning_rate": 1e-05, "loss": 1.2859, "step": 3980 }, { "epoch": 0.69, "grad_norm": 2.492711067199707, "learning_rate": 1e-05, "loss": 1.3889, "step": 3990 }, { "epoch": 0.69, "grad_norm": 2.103754997253418, "learning_rate": 1e-05, "loss": 1.3198, "step": 4000 }, { "epoch": 0.69, "grad_norm": 2.285733699798584, "learning_rate": 1e-05, "loss": 1.3397, "step": 4010 }, { "epoch": 0.69, "grad_norm": 1.908586859703064, "learning_rate": 1e-05, "loss": 1.4036, "step": 4020 }, { "epoch": 0.69, "grad_norm": 3.6941065788269043, "learning_rate": 1e-05, "loss": 1.3399, "step": 4030 }, { "epoch": 0.69, "grad_norm": 2.5054636001586914, "learning_rate": 1e-05, "loss": 1.2888, "step": 4040 }, { "epoch": 0.7, "grad_norm": 2.9840850830078125, "learning_rate": 1e-05, "loss": 1.39, "step": 4050 }, { "epoch": 0.7, "grad_norm": 1.9543534517288208, "learning_rate": 1e-05, "loss": 1.3243, "step": 4060 }, { "epoch": 0.7, "grad_norm": 2.3872506618499756, "learning_rate": 1e-05, "loss": 1.3321, "step": 4070 }, { "epoch": 0.7, "grad_norm": 3.167403221130371, "learning_rate": 1e-05, "loss": 1.297, "step": 4080 }, { "epoch": 0.7, "grad_norm": 2.350149631500244, "learning_rate": 1e-05, "loss": 1.2943, "step": 4090 }, { "epoch": 0.7, "grad_norm": 2.307448148727417, "learning_rate": 1e-05, "loss": 1.1838, "step": 4100 }, { "epoch": 0.71, "grad_norm": 3.179839849472046, "learning_rate": 1e-05, "loss": 1.3981, "step": 4110 }, { "epoch": 0.71, "grad_norm": 1.8496357202529907, "learning_rate": 1e-05, "loss": 1.3317, "step": 4120 }, { "epoch": 0.71, "grad_norm": 2.6441686153411865, "learning_rate": 1e-05, "loss": 1.3354, "step": 4130 }, { "epoch": 0.71, "grad_norm": 2.0524277687072754, "learning_rate": 1e-05, "loss": 1.3213, "step": 4140 }, { "epoch": 0.71, "grad_norm": 2.1615052223205566, "learning_rate": 1e-05, "loss": 1.2763, "step": 4150 }, { "epoch": 0.71, "grad_norm": 2.3598992824554443, "learning_rate": 1e-05, "loss": 1.2179, "step": 4160 }, { "epoch": 0.72, "grad_norm": 1.8935139179229736, "learning_rate": 1e-05, "loss": 1.3008, "step": 4170 }, { "epoch": 0.72, "grad_norm": 2.8705596923828125, "learning_rate": 1e-05, "loss": 1.359, "step": 4180 }, { "epoch": 0.72, "grad_norm": 2.2775766849517822, "learning_rate": 1e-05, "loss": 1.2492, "step": 4190 }, { "epoch": 0.72, "grad_norm": 2.049804449081421, "learning_rate": 1e-05, "loss": 1.3017, "step": 4200 }, { "epoch": 0.72, "grad_norm": 3.507704257965088, "learning_rate": 1e-05, "loss": 1.237, "step": 4210 }, { "epoch": 0.72, "grad_norm": 2.9242076873779297, "learning_rate": 1e-05, "loss": 1.4219, "step": 4220 }, { "epoch": 0.73, "grad_norm": 1.7825714349746704, "learning_rate": 1e-05, "loss": 1.3866, "step": 4230 }, { "epoch": 0.73, "grad_norm": 2.5417842864990234, "learning_rate": 1e-05, "loss": 1.3076, "step": 4240 }, { "epoch": 0.73, "grad_norm": 2.281651020050049, "learning_rate": 1e-05, "loss": 1.3076, "step": 4250 }, { "epoch": 0.73, "grad_norm": 2.7986044883728027, "learning_rate": 1e-05, "loss": 1.2935, "step": 4260 }, { "epoch": 0.73, "grad_norm": 2.2875053882598877, "learning_rate": 1e-05, "loss": 1.2891, "step": 4270 }, { "epoch": 0.74, "grad_norm": 2.329230308532715, "learning_rate": 1e-05, "loss": 1.2477, "step": 4280 }, { "epoch": 0.74, "grad_norm": 2.3861818313598633, "learning_rate": 1e-05, "loss": 1.4013, "step": 4290 }, { "epoch": 0.74, "grad_norm": 1.9318095445632935, "learning_rate": 1e-05, "loss": 1.2565, "step": 4300 }, { "epoch": 0.74, "grad_norm": 2.428346633911133, "learning_rate": 1e-05, "loss": 1.3452, "step": 4310 }, { "epoch": 0.74, "grad_norm": 3.1091361045837402, "learning_rate": 1e-05, "loss": 1.37, "step": 4320 }, { "epoch": 0.74, "grad_norm": 2.457003593444824, "learning_rate": 1e-05, "loss": 1.3968, "step": 4330 }, { "epoch": 0.75, "grad_norm": 2.365961790084839, "learning_rate": 1e-05, "loss": 1.4461, "step": 4340 }, { "epoch": 0.75, "grad_norm": 3.6295409202575684, "learning_rate": 1e-05, "loss": 1.4079, "step": 4350 }, { "epoch": 0.75, "grad_norm": 1.9824382066726685, "learning_rate": 1e-05, "loss": 1.3563, "step": 4360 }, { "epoch": 0.75, "grad_norm": 2.455552101135254, "learning_rate": 1e-05, "loss": 1.4229, "step": 4370 }, { "epoch": 0.75, "grad_norm": 2.2429590225219727, "learning_rate": 1e-05, "loss": 1.312, "step": 4380 }, { "epoch": 0.75, "grad_norm": 2.8446462154388428, "learning_rate": 1e-05, "loss": 1.382, "step": 4390 }, { "epoch": 0.76, "grad_norm": 2.4450223445892334, "learning_rate": 1e-05, "loss": 1.3029, "step": 4400 }, { "epoch": 0.76, "grad_norm": 2.3352503776550293, "learning_rate": 1e-05, "loss": 1.3553, "step": 4410 }, { "epoch": 0.76, "grad_norm": 2.1491830348968506, "learning_rate": 1e-05, "loss": 1.3227, "step": 4420 }, { "epoch": 0.76, "grad_norm": 2.02473521232605, "learning_rate": 1e-05, "loss": 1.2365, "step": 4430 }, { "epoch": 0.76, "grad_norm": 3.059314489364624, "learning_rate": 1e-05, "loss": 1.4442, "step": 4440 }, { "epoch": 0.76, "grad_norm": 1.8352389335632324, "learning_rate": 1e-05, "loss": 1.3092, "step": 4450 }, { "epoch": 0.77, "grad_norm": 2.165952205657959, "learning_rate": 1e-05, "loss": 1.335, "step": 4460 }, { "epoch": 0.77, "grad_norm": 3.213463544845581, "learning_rate": 1e-05, "loss": 1.2948, "step": 4470 }, { "epoch": 0.77, "grad_norm": 2.2183351516723633, "learning_rate": 1e-05, "loss": 1.341, "step": 4480 }, { "epoch": 0.77, "grad_norm": 2.9775967597961426, "learning_rate": 1e-05, "loss": 1.2896, "step": 4490 }, { "epoch": 0.77, "grad_norm": 2.169218063354492, "learning_rate": 1e-05, "loss": 1.3855, "step": 4500 }, { "epoch": 0.77, "grad_norm": 2.246225357055664, "learning_rate": 1e-05, "loss": 1.3308, "step": 4510 }, { "epoch": 0.78, "grad_norm": 2.859213352203369, "learning_rate": 1e-05, "loss": 1.2775, "step": 4520 }, { "epoch": 0.78, "grad_norm": 2.4901084899902344, "learning_rate": 1e-05, "loss": 1.277, "step": 4530 }, { "epoch": 0.78, "grad_norm": 1.7237708568572998, "learning_rate": 1e-05, "loss": 1.233, "step": 4540 }, { "epoch": 0.78, "grad_norm": 1.759042501449585, "learning_rate": 1e-05, "loss": 1.191, "step": 4550 }, { "epoch": 0.78, "grad_norm": 2.1091668605804443, "learning_rate": 1e-05, "loss": 1.4199, "step": 4560 }, { "epoch": 0.78, "grad_norm": 2.2881340980529785, "learning_rate": 1e-05, "loss": 1.3126, "step": 4570 }, { "epoch": 0.79, "grad_norm": 2.373076915740967, "learning_rate": 1e-05, "loss": 1.3503, "step": 4580 }, { "epoch": 0.79, "grad_norm": 2.527299404144287, "learning_rate": 1e-05, "loss": 1.3033, "step": 4590 }, { "epoch": 0.79, "grad_norm": 1.6886786222457886, "learning_rate": 1e-05, "loss": 1.3138, "step": 4600 }, { "epoch": 0.79, "grad_norm": 2.049691915512085, "learning_rate": 1e-05, "loss": 1.4259, "step": 4610 }, { "epoch": 0.79, "grad_norm": 1.689084529876709, "learning_rate": 1e-05, "loss": 1.2643, "step": 4620 }, { "epoch": 0.8, "grad_norm": 2.415674924850464, "learning_rate": 1e-05, "loss": 1.3704, "step": 4630 }, { "epoch": 0.8, "grad_norm": 2.1368796825408936, "learning_rate": 1e-05, "loss": 1.403, "step": 4640 }, { "epoch": 0.8, "grad_norm": 2.140413761138916, "learning_rate": 1e-05, "loss": 1.2695, "step": 4650 }, { "epoch": 0.8, "grad_norm": 2.2510221004486084, "learning_rate": 1e-05, "loss": 1.3455, "step": 4660 }, { "epoch": 0.8, "grad_norm": 1.8963171243667603, "learning_rate": 1e-05, "loss": 1.3342, "step": 4670 }, { "epoch": 0.8, "grad_norm": 2.0337469577789307, "learning_rate": 1e-05, "loss": 1.3418, "step": 4680 }, { "epoch": 0.81, "grad_norm": 2.0309271812438965, "learning_rate": 1e-05, "loss": 1.3451, "step": 4690 }, { "epoch": 0.81, "grad_norm": 1.512277603149414, "learning_rate": 1e-05, "loss": 1.3368, "step": 4700 }, { "epoch": 0.81, "grad_norm": 2.1851866245269775, "learning_rate": 1e-05, "loss": 1.3171, "step": 4710 }, { "epoch": 0.81, "grad_norm": 1.9769792556762695, "learning_rate": 1e-05, "loss": 1.3401, "step": 4720 }, { "epoch": 0.81, "grad_norm": 2.2425997257232666, "learning_rate": 1e-05, "loss": 1.4102, "step": 4730 }, { "epoch": 0.81, "grad_norm": 2.012803316116333, "learning_rate": 1e-05, "loss": 1.2514, "step": 4740 }, { "epoch": 0.82, "grad_norm": 2.3681206703186035, "learning_rate": 1e-05, "loss": 1.2983, "step": 4750 }, { "epoch": 0.82, "grad_norm": 2.1048636436462402, "learning_rate": 1e-05, "loss": 1.2526, "step": 4760 }, { "epoch": 0.82, "grad_norm": 2.0571296215057373, "learning_rate": 1e-05, "loss": 1.3304, "step": 4770 }, { "epoch": 0.82, "grad_norm": 1.8607044219970703, "learning_rate": 1e-05, "loss": 1.3497, "step": 4780 }, { "epoch": 0.82, "grad_norm": 2.278191566467285, "learning_rate": 1e-05, "loss": 1.1942, "step": 4790 }, { "epoch": 0.82, "grad_norm": 1.9115612506866455, "learning_rate": 1e-05, "loss": 1.3426, "step": 4800 }, { "epoch": 0.83, "grad_norm": 2.5302011966705322, "learning_rate": 1e-05, "loss": 1.3114, "step": 4810 }, { "epoch": 0.83, "grad_norm": 2.074410915374756, "learning_rate": 1e-05, "loss": 1.3329, "step": 4820 }, { "epoch": 0.83, "grad_norm": 1.883925199508667, "learning_rate": 1e-05, "loss": 1.3415, "step": 4830 }, { "epoch": 0.83, "grad_norm": 1.8144758939743042, "learning_rate": 1e-05, "loss": 1.3551, "step": 4840 }, { "epoch": 0.83, "grad_norm": 2.2435834407806396, "learning_rate": 1e-05, "loss": 1.2709, "step": 4850 }, { "epoch": 0.83, "grad_norm": 1.9630836248397827, "learning_rate": 1e-05, "loss": 1.2654, "step": 4860 }, { "epoch": 0.84, "grad_norm": 2.0202670097351074, "learning_rate": 1e-05, "loss": 1.4137, "step": 4870 }, { "epoch": 0.84, "grad_norm": 2.2011775970458984, "learning_rate": 1e-05, "loss": 1.2729, "step": 4880 }, { "epoch": 0.84, "grad_norm": 1.7111473083496094, "learning_rate": 1e-05, "loss": 1.3189, "step": 4890 }, { "epoch": 0.84, "grad_norm": 2.1530821323394775, "learning_rate": 1e-05, "loss": 1.3891, "step": 4900 }, { "epoch": 0.84, "grad_norm": 2.096242904663086, "learning_rate": 1e-05, "loss": 1.3674, "step": 4910 }, { "epoch": 0.84, "grad_norm": 2.7188382148742676, "learning_rate": 1e-05, "loss": 1.3524, "step": 4920 }, { "epoch": 0.85, "grad_norm": 2.8607091903686523, "learning_rate": 1e-05, "loss": 1.3462, "step": 4930 }, { "epoch": 0.85, "grad_norm": 1.8992630243301392, "learning_rate": 1e-05, "loss": 1.1978, "step": 4940 }, { "epoch": 0.85, "grad_norm": 1.6134096384048462, "learning_rate": 1e-05, "loss": 1.2554, "step": 4950 }, { "epoch": 0.85, "grad_norm": 2.189053773880005, "learning_rate": 1e-05, "loss": 1.3185, "step": 4960 }, { "epoch": 0.85, "grad_norm": 1.9997789859771729, "learning_rate": 1e-05, "loss": 1.4556, "step": 4970 }, { "epoch": 0.86, "grad_norm": 1.966931939125061, "learning_rate": 1e-05, "loss": 1.3377, "step": 4980 }, { "epoch": 0.86, "grad_norm": 1.6255134344100952, "learning_rate": 1e-05, "loss": 1.2812, "step": 4990 }, { "epoch": 0.86, "grad_norm": 2.119266986846924, "learning_rate": 1e-05, "loss": 1.3713, "step": 5000 }, { "epoch": 0.86, "grad_norm": 2.6048781871795654, "learning_rate": 1e-05, "loss": 1.2766, "step": 5010 }, { "epoch": 0.86, "grad_norm": 2.272893190383911, "learning_rate": 1e-05, "loss": 1.311, "step": 5020 }, { "epoch": 0.86, "grad_norm": 2.620817184448242, "learning_rate": 1e-05, "loss": 1.3302, "step": 5030 }, { "epoch": 0.87, "grad_norm": 2.3737807273864746, "learning_rate": 1e-05, "loss": 1.254, "step": 5040 }, { "epoch": 0.87, "grad_norm": 2.072502374649048, "learning_rate": 1e-05, "loss": 1.244, "step": 5050 }, { "epoch": 0.87, "grad_norm": 2.080003261566162, "learning_rate": 1e-05, "loss": 1.187, "step": 5060 }, { "epoch": 0.87, "grad_norm": 2.330199718475342, "learning_rate": 1e-05, "loss": 1.235, "step": 5070 }, { "epoch": 0.87, "grad_norm": 2.5456225872039795, "learning_rate": 1e-05, "loss": 1.2679, "step": 5080 }, { "epoch": 0.87, "grad_norm": 2.637150526046753, "learning_rate": 1e-05, "loss": 1.3361, "step": 5090 }, { "epoch": 0.88, "grad_norm": 2.265345811843872, "learning_rate": 1e-05, "loss": 1.3633, "step": 5100 }, { "epoch": 0.88, "grad_norm": 2.577023506164551, "learning_rate": 1e-05, "loss": 1.2505, "step": 5110 }, { "epoch": 0.88, "grad_norm": 2.394627094268799, "learning_rate": 1e-05, "loss": 1.2796, "step": 5120 }, { "epoch": 0.88, "grad_norm": 2.3129894733428955, "learning_rate": 1e-05, "loss": 1.2923, "step": 5130 }, { "epoch": 0.88, "grad_norm": 1.9741424322128296, "learning_rate": 1e-05, "loss": 1.3085, "step": 5140 }, { "epoch": 0.88, "grad_norm": 2.1835744380950928, "learning_rate": 1e-05, "loss": 1.2279, "step": 5150 }, { "epoch": 0.89, "grad_norm": 3.0756380558013916, "learning_rate": 1e-05, "loss": 1.2934, "step": 5160 }, { "epoch": 0.89, "grad_norm": 2.079761505126953, "learning_rate": 1e-05, "loss": 1.3547, "step": 5170 }, { "epoch": 0.89, "grad_norm": 1.9494423866271973, "learning_rate": 1e-05, "loss": 1.3259, "step": 5180 }, { "epoch": 0.89, "grad_norm": 3.535632610321045, "learning_rate": 1e-05, "loss": 1.283, "step": 5190 }, { "epoch": 0.89, "grad_norm": 2.5230371952056885, "learning_rate": 1e-05, "loss": 1.4404, "step": 5200 }, { "epoch": 0.89, "grad_norm": 1.859532117843628, "learning_rate": 1e-05, "loss": 1.3204, "step": 5210 }, { "epoch": 0.9, "grad_norm": 2.2032246589660645, "learning_rate": 1e-05, "loss": 1.303, "step": 5220 }, { "epoch": 0.9, "grad_norm": 2.504431962966919, "learning_rate": 1e-05, "loss": 1.273, "step": 5230 }, { "epoch": 0.9, "grad_norm": 2.2306768894195557, "learning_rate": 1e-05, "loss": 1.3615, "step": 5240 }, { "epoch": 0.9, "grad_norm": 2.0646228790283203, "learning_rate": 1e-05, "loss": 1.3246, "step": 5250 }, { "epoch": 0.9, "grad_norm": 2.632603883743286, "learning_rate": 1e-05, "loss": 1.3169, "step": 5260 }, { "epoch": 0.91, "grad_norm": 1.8753434419631958, "learning_rate": 1e-05, "loss": 1.4855, "step": 5270 }, { "epoch": 0.91, "grad_norm": 2.7019877433776855, "learning_rate": 1e-05, "loss": 1.3734, "step": 5280 }, { "epoch": 0.91, "grad_norm": 2.3574256896972656, "learning_rate": 1e-05, "loss": 1.4232, "step": 5290 }, { "epoch": 0.91, "grad_norm": 2.2648611068725586, "learning_rate": 1e-05, "loss": 1.3317, "step": 5300 }, { "epoch": 0.91, "grad_norm": 2.8861615657806396, "learning_rate": 1e-05, "loss": 1.3584, "step": 5310 }, { "epoch": 0.91, "grad_norm": 1.9412791728973389, "learning_rate": 1e-05, "loss": 1.3193, "step": 5320 }, { "epoch": 0.92, "grad_norm": 1.8828357458114624, "learning_rate": 1e-05, "loss": 1.4051, "step": 5330 }, { "epoch": 0.92, "grad_norm": 2.7321279048919678, "learning_rate": 1e-05, "loss": 1.2798, "step": 5340 }, { "epoch": 0.92, "grad_norm": 1.8962711095809937, "learning_rate": 1e-05, "loss": 1.2847, "step": 5350 }, { "epoch": 0.92, "grad_norm": 2.476041316986084, "learning_rate": 1e-05, "loss": 1.3349, "step": 5360 }, { "epoch": 0.92, "grad_norm": 2.4387452602386475, "learning_rate": 1e-05, "loss": 1.3193, "step": 5370 }, { "epoch": 0.92, "grad_norm": 2.867537021636963, "learning_rate": 1e-05, "loss": 1.3567, "step": 5380 }, { "epoch": 0.93, "grad_norm": 1.8666340112686157, "learning_rate": 1e-05, "loss": 1.2492, "step": 5390 }, { "epoch": 0.93, "grad_norm": 2.384167432785034, "learning_rate": 1e-05, "loss": 1.2917, "step": 5400 }, { "epoch": 0.93, "grad_norm": 3.2074921131134033, "learning_rate": 1e-05, "loss": 1.3411, "step": 5410 }, { "epoch": 0.93, "grad_norm": 2.630751609802246, "learning_rate": 1e-05, "loss": 1.4372, "step": 5420 }, { "epoch": 0.93, "grad_norm": 2.105861186981201, "learning_rate": 1e-05, "loss": 1.4445, "step": 5430 }, { "epoch": 0.93, "grad_norm": 2.2219021320343018, "learning_rate": 1e-05, "loss": 1.3419, "step": 5440 }, { "epoch": 0.94, "grad_norm": 1.8928240537643433, "learning_rate": 1e-05, "loss": 1.3888, "step": 5450 }, { "epoch": 0.94, "grad_norm": 2.9438905715942383, "learning_rate": 1e-05, "loss": 1.3179, "step": 5460 }, { "epoch": 0.94, "grad_norm": 2.450190305709839, "learning_rate": 1e-05, "loss": 1.2891, "step": 5470 }, { "epoch": 0.94, "grad_norm": 1.6955147981643677, "learning_rate": 1e-05, "loss": 1.2105, "step": 5480 }, { "epoch": 0.94, "grad_norm": 2.195138931274414, "learning_rate": 1e-05, "loss": 1.3871, "step": 5490 }, { "epoch": 0.94, "grad_norm": 2.4984004497528076, "learning_rate": 1e-05, "loss": 1.311, "step": 5500 }, { "epoch": 0.95, "grad_norm": 1.8167157173156738, "learning_rate": 1e-05, "loss": 1.2877, "step": 5510 }, { "epoch": 0.95, "grad_norm": 2.051131248474121, "learning_rate": 1e-05, "loss": 1.3146, "step": 5520 }, { "epoch": 0.95, "grad_norm": 2.0520663261413574, "learning_rate": 1e-05, "loss": 1.3981, "step": 5530 }, { "epoch": 0.95, "grad_norm": 4.21244478225708, "learning_rate": 1e-05, "loss": 1.3601, "step": 5540 }, { "epoch": 0.95, "grad_norm": 2.481689453125, "learning_rate": 1e-05, "loss": 1.3433, "step": 5550 }, { "epoch": 0.95, "grad_norm": 2.1406707763671875, "learning_rate": 1e-05, "loss": 1.353, "step": 5560 }, { "epoch": 0.96, "grad_norm": 2.8597412109375, "learning_rate": 1e-05, "loss": 1.3786, "step": 5570 }, { "epoch": 0.96, "grad_norm": 1.6117429733276367, "learning_rate": 1e-05, "loss": 1.2154, "step": 5580 }, { "epoch": 0.96, "grad_norm": 1.9745125770568848, "learning_rate": 1e-05, "loss": 1.2937, "step": 5590 }, { "epoch": 0.96, "grad_norm": 1.9402446746826172, "learning_rate": 1e-05, "loss": 1.3025, "step": 5600 }, { "epoch": 0.96, "grad_norm": 2.1005172729492188, "learning_rate": 1e-05, "loss": 1.2995, "step": 5610 }, { "epoch": 0.97, "grad_norm": 3.382460117340088, "learning_rate": 1e-05, "loss": 1.3255, "step": 5620 }, { "epoch": 0.97, "grad_norm": 2.3474137783050537, "learning_rate": 1e-05, "loss": 1.327, "step": 5630 }, { "epoch": 0.97, "grad_norm": 2.1676485538482666, "learning_rate": 1e-05, "loss": 1.3022, "step": 5640 }, { "epoch": 0.97, "grad_norm": 4.638181686401367, "learning_rate": 1e-05, "loss": 1.3579, "step": 5650 }, { "epoch": 0.97, "grad_norm": 1.8829666376113892, "learning_rate": 1e-05, "loss": 1.2526, "step": 5660 }, { "epoch": 0.97, "grad_norm": 2.283449649810791, "learning_rate": 1e-05, "loss": 1.2292, "step": 5670 }, { "epoch": 0.98, "grad_norm": 2.7156941890716553, "learning_rate": 1e-05, "loss": 1.3473, "step": 5680 }, { "epoch": 0.98, "grad_norm": 2.614403486251831, "learning_rate": 1e-05, "loss": 1.2649, "step": 5690 }, { "epoch": 0.98, "grad_norm": 3.518841505050659, "learning_rate": 1e-05, "loss": 1.4286, "step": 5700 }, { "epoch": 0.98, "grad_norm": 2.694382429122925, "learning_rate": 1e-05, "loss": 1.3353, "step": 5710 }, { "epoch": 0.98, "grad_norm": 2.2780184745788574, "learning_rate": 1e-05, "loss": 1.279, "step": 5720 }, { "epoch": 0.98, "grad_norm": 2.911449670791626, "learning_rate": 1e-05, "loss": 1.3446, "step": 5730 }, { "epoch": 0.99, "grad_norm": 1.7798064947128296, "learning_rate": 1e-05, "loss": 1.2469, "step": 5740 }, { "epoch": 0.99, "grad_norm": 2.5752665996551514, "learning_rate": 1e-05, "loss": 1.2476, "step": 5750 }, { "epoch": 0.99, "grad_norm": 2.1589324474334717, "learning_rate": 1e-05, "loss": 1.3267, "step": 5760 }, { "epoch": 0.99, "grad_norm": 3.644857168197632, "learning_rate": 1e-05, "loss": 1.4403, "step": 5770 }, { "epoch": 0.99, "grad_norm": 2.159285545349121, "learning_rate": 1e-05, "loss": 1.2866, "step": 5780 }, { "epoch": 0.99, "grad_norm": 2.4415762424468994, "learning_rate": 1e-05, "loss": 1.2472, "step": 5790 }, { "epoch": 1.0, "grad_norm": 2.266463279724121, "learning_rate": 1e-05, "loss": 1.3598, "step": 5800 }, { "epoch": 1.0, "grad_norm": 2.450885534286499, "learning_rate": 1e-05, "loss": 1.2823, "step": 5810 }, { "epoch": 1.0, "grad_norm": 1.997843861579895, "learning_rate": 1e-05, "loss": 1.3197, "step": 5820 }, { "epoch": 1.0, "eval_loss": 1.7925161123275757, "eval_runtime": 21.4712, "eval_samples_per_second": 46.574, "eval_steps_per_second": 46.574, "step": 5823 }, { "epoch": 1.0, "grad_norm": 1.5618516206741333, "learning_rate": 1e-05, "loss": 1.1163, "step": 5830 }, { "epoch": 1.0, "grad_norm": 1.7338979244232178, "learning_rate": 1e-05, "loss": 0.9587, "step": 5840 }, { "epoch": 1.0, "grad_norm": 1.972121000289917, "learning_rate": 1e-05, "loss": 1.0549, "step": 5850 }, { "epoch": 1.01, "grad_norm": 3.1814982891082764, "learning_rate": 1e-05, "loss": 0.9983, "step": 5860 }, { "epoch": 1.01, "grad_norm": 2.32295560836792, "learning_rate": 1e-05, "loss": 0.9349, "step": 5870 }, { "epoch": 1.01, "grad_norm": 2.3985841274261475, "learning_rate": 1e-05, "loss": 0.9369, "step": 5880 }, { "epoch": 1.01, "grad_norm": 2.472193717956543, "learning_rate": 1e-05, "loss": 1.1074, "step": 5890 }, { "epoch": 1.01, "grad_norm": 2.7933571338653564, "learning_rate": 1e-05, "loss": 1.0088, "step": 5900 }, { "epoch": 1.01, "grad_norm": 3.421544075012207, "learning_rate": 1e-05, "loss": 1.0128, "step": 5910 }, { "epoch": 1.02, "grad_norm": 2.9460196495056152, "learning_rate": 1e-05, "loss": 0.9706, "step": 5920 }, { "epoch": 1.02, "grad_norm": 2.7304258346557617, "learning_rate": 1e-05, "loss": 0.9553, "step": 5930 }, { "epoch": 1.02, "grad_norm": 2.0735466480255127, "learning_rate": 1e-05, "loss": 1.0339, "step": 5940 }, { "epoch": 1.02, "grad_norm": 2.4456515312194824, "learning_rate": 1e-05, "loss": 0.9472, "step": 5950 }, { "epoch": 1.02, "grad_norm": 2.9821925163269043, "learning_rate": 1e-05, "loss": 0.9725, "step": 5960 }, { "epoch": 1.03, "grad_norm": 2.6493866443634033, "learning_rate": 1e-05, "loss": 1.0446, "step": 5970 }, { "epoch": 1.03, "grad_norm": 3.323594093322754, "learning_rate": 1e-05, "loss": 0.9804, "step": 5980 }, { "epoch": 1.03, "grad_norm": 2.6612460613250732, "learning_rate": 1e-05, "loss": 1.0335, "step": 5990 }, { "epoch": 1.03, "grad_norm": 2.080219030380249, "learning_rate": 1e-05, "loss": 0.9365, "step": 6000 }, { "epoch": 1.03, "grad_norm": 3.0296177864074707, "learning_rate": 1e-05, "loss": 0.9439, "step": 6010 }, { "epoch": 1.03, "grad_norm": 4.418188095092773, "learning_rate": 1e-05, "loss": 1.0635, "step": 6020 }, { "epoch": 1.04, "grad_norm": 1.9283742904663086, "learning_rate": 1e-05, "loss": 0.9956, "step": 6030 }, { "epoch": 1.04, "grad_norm": 2.918304681777954, "learning_rate": 1e-05, "loss": 1.0152, "step": 6040 }, { "epoch": 1.04, "grad_norm": 2.512454032897949, "learning_rate": 1e-05, "loss": 1.0259, "step": 6050 }, { "epoch": 1.04, "grad_norm": 2.7206339836120605, "learning_rate": 1e-05, "loss": 0.9879, "step": 6060 }, { "epoch": 1.04, "grad_norm": 2.2170846462249756, "learning_rate": 1e-05, "loss": 0.9946, "step": 6070 }, { "epoch": 1.04, "grad_norm": 2.6726131439208984, "learning_rate": 1e-05, "loss": 0.9833, "step": 6080 }, { "epoch": 1.05, "grad_norm": 2.8087282180786133, "learning_rate": 1e-05, "loss": 1.0294, "step": 6090 }, { "epoch": 1.05, "grad_norm": 1.9470926523208618, "learning_rate": 1e-05, "loss": 1.0807, "step": 6100 }, { "epoch": 1.05, "grad_norm": 2.086599588394165, "learning_rate": 1e-05, "loss": 0.9602, "step": 6110 }, { "epoch": 1.05, "grad_norm": 2.537266254425049, "learning_rate": 1e-05, "loss": 0.9963, "step": 6120 }, { "epoch": 1.05, "grad_norm": 2.6490066051483154, "learning_rate": 1e-05, "loss": 1.0077, "step": 6130 }, { "epoch": 1.05, "grad_norm": 2.294142723083496, "learning_rate": 1e-05, "loss": 1.0145, "step": 6140 }, { "epoch": 1.06, "grad_norm": 1.8552401065826416, "learning_rate": 1e-05, "loss": 0.8509, "step": 6150 }, { "epoch": 1.06, "grad_norm": 1.9774388074874878, "learning_rate": 1e-05, "loss": 0.9475, "step": 6160 }, { "epoch": 1.06, "grad_norm": 2.0472378730773926, "learning_rate": 1e-05, "loss": 1.0339, "step": 6170 }, { "epoch": 1.06, "grad_norm": 1.7465121746063232, "learning_rate": 1e-05, "loss": 1.0642, "step": 6180 }, { "epoch": 1.06, "grad_norm": 1.965936541557312, "learning_rate": 1e-05, "loss": 0.9558, "step": 6190 }, { "epoch": 1.06, "grad_norm": 3.1949400901794434, "learning_rate": 1e-05, "loss": 0.9357, "step": 6200 }, { "epoch": 1.07, "grad_norm": 3.121185302734375, "learning_rate": 1e-05, "loss": 1.0069, "step": 6210 }, { "epoch": 1.07, "grad_norm": 2.357470750808716, "learning_rate": 1e-05, "loss": 0.9, "step": 6220 }, { "epoch": 1.07, "grad_norm": 2.186276435852051, "learning_rate": 1e-05, "loss": 0.952, "step": 6230 }, { "epoch": 1.07, "grad_norm": 2.1679868698120117, "learning_rate": 1e-05, "loss": 0.9949, "step": 6240 }, { "epoch": 1.07, "grad_norm": 2.6253249645233154, "learning_rate": 1e-05, "loss": 0.995, "step": 6250 }, { "epoch": 1.08, "grad_norm": 2.671788454055786, "learning_rate": 1e-05, "loss": 1.0111, "step": 6260 }, { "epoch": 1.08, "grad_norm": 3.011953592300415, "learning_rate": 1e-05, "loss": 1.0067, "step": 6270 }, { "epoch": 1.08, "grad_norm": 2.894116163253784, "learning_rate": 1e-05, "loss": 1.0278, "step": 6280 }, { "epoch": 1.08, "grad_norm": 1.782424807548523, "learning_rate": 1e-05, "loss": 0.9849, "step": 6290 }, { "epoch": 1.08, "grad_norm": 2.0209150314331055, "learning_rate": 1e-05, "loss": 1.0661, "step": 6300 }, { "epoch": 1.08, "grad_norm": 2.0378525257110596, "learning_rate": 1e-05, "loss": 1.0675, "step": 6310 }, { "epoch": 1.09, "grad_norm": 2.555785655975342, "learning_rate": 1e-05, "loss": 1.0687, "step": 6320 }, { "epoch": 1.09, "grad_norm": 2.312987804412842, "learning_rate": 1e-05, "loss": 0.9451, "step": 6330 }, { "epoch": 1.09, "grad_norm": 2.7663776874542236, "learning_rate": 1e-05, "loss": 1.0758, "step": 6340 }, { "epoch": 1.09, "grad_norm": 4.143360137939453, "learning_rate": 1e-05, "loss": 0.9993, "step": 6350 }, { "epoch": 1.09, "grad_norm": 2.927686929702759, "learning_rate": 1e-05, "loss": 1.0186, "step": 6360 }, { "epoch": 1.09, "grad_norm": 2.797344207763672, "learning_rate": 1e-05, "loss": 1.0245, "step": 6370 }, { "epoch": 1.1, "grad_norm": 3.264024496078491, "learning_rate": 1e-05, "loss": 1.1007, "step": 6380 }, { "epoch": 1.1, "grad_norm": 2.4195101261138916, "learning_rate": 1e-05, "loss": 0.9724, "step": 6390 }, { "epoch": 1.1, "grad_norm": 2.8212947845458984, "learning_rate": 1e-05, "loss": 1.0232, "step": 6400 }, { "epoch": 1.1, "grad_norm": 1.9952080249786377, "learning_rate": 1e-05, "loss": 1.0124, "step": 6410 }, { "epoch": 1.1, "grad_norm": 3.3921620845794678, "learning_rate": 1e-05, "loss": 1.0581, "step": 6420 }, { "epoch": 1.1, "grad_norm": 2.338721752166748, "learning_rate": 1e-05, "loss": 0.9655, "step": 6430 }, { "epoch": 1.11, "grad_norm": 2.622527599334717, "learning_rate": 1e-05, "loss": 1.0654, "step": 6440 }, { "epoch": 1.11, "grad_norm": 2.8540849685668945, "learning_rate": 1e-05, "loss": 0.9467, "step": 6450 }, { "epoch": 1.11, "grad_norm": 2.3519699573516846, "learning_rate": 1e-05, "loss": 0.989, "step": 6460 }, { "epoch": 1.11, "grad_norm": 2.0406546592712402, "learning_rate": 1e-05, "loss": 0.9715, "step": 6470 }, { "epoch": 1.11, "grad_norm": 3.2899045944213867, "learning_rate": 1e-05, "loss": 0.9661, "step": 6480 }, { "epoch": 1.11, "grad_norm": 1.9575726985931396, "learning_rate": 1e-05, "loss": 1.0636, "step": 6490 }, { "epoch": 1.12, "grad_norm": 2.0779426097869873, "learning_rate": 1e-05, "loss": 1.0781, "step": 6500 }, { "epoch": 1.12, "grad_norm": 2.743601083755493, "learning_rate": 1e-05, "loss": 1.0193, "step": 6510 }, { "epoch": 1.12, "grad_norm": 1.9085830450057983, "learning_rate": 1e-05, "loss": 0.9416, "step": 6520 }, { "epoch": 1.12, "grad_norm": 2.8828158378601074, "learning_rate": 1e-05, "loss": 1.0238, "step": 6530 }, { "epoch": 1.12, "grad_norm": 2.3883135318756104, "learning_rate": 1e-05, "loss": 1.0709, "step": 6540 }, { "epoch": 1.12, "grad_norm": 2.5728261470794678, "learning_rate": 1e-05, "loss": 0.9783, "step": 6550 }, { "epoch": 1.13, "grad_norm": 1.7872670888900757, "learning_rate": 1e-05, "loss": 1.0118, "step": 6560 }, { "epoch": 1.13, "grad_norm": 3.844386100769043, "learning_rate": 1e-05, "loss": 1.062, "step": 6570 }, { "epoch": 1.13, "grad_norm": 2.2257680892944336, "learning_rate": 1e-05, "loss": 0.9837, "step": 6580 }, { "epoch": 1.13, "grad_norm": 2.2637107372283936, "learning_rate": 1e-05, "loss": 1.0532, "step": 6590 }, { "epoch": 1.13, "grad_norm": 2.8380672931671143, "learning_rate": 1e-05, "loss": 0.9508, "step": 6600 }, { "epoch": 1.14, "grad_norm": 2.0793893337249756, "learning_rate": 1e-05, "loss": 0.9432, "step": 6610 }, { "epoch": 1.14, "grad_norm": 2.721628427505493, "learning_rate": 1e-05, "loss": 1.0368, "step": 6620 }, { "epoch": 1.14, "grad_norm": 2.677793025970459, "learning_rate": 1e-05, "loss": 1.0157, "step": 6630 }, { "epoch": 1.14, "grad_norm": 3.3213117122650146, "learning_rate": 1e-05, "loss": 0.9747, "step": 6640 }, { "epoch": 1.14, "grad_norm": 2.401367425918579, "learning_rate": 1e-05, "loss": 1.0084, "step": 6650 }, { "epoch": 1.14, "grad_norm": 2.3696703910827637, "learning_rate": 1e-05, "loss": 1.0004, "step": 6660 }, { "epoch": 1.15, "grad_norm": 2.552927255630493, "learning_rate": 1e-05, "loss": 1.1054, "step": 6670 }, { "epoch": 1.15, "grad_norm": 2.5474822521209717, "learning_rate": 1e-05, "loss": 1.0448, "step": 6680 }, { "epoch": 1.15, "grad_norm": 2.3490467071533203, "learning_rate": 1e-05, "loss": 1.0061, "step": 6690 }, { "epoch": 1.15, "grad_norm": 2.2008440494537354, "learning_rate": 1e-05, "loss": 1.0262, "step": 6700 }, { "epoch": 1.15, "grad_norm": 2.552922487258911, "learning_rate": 1e-05, "loss": 1.0031, "step": 6710 }, { "epoch": 1.15, "grad_norm": 2.1921865940093994, "learning_rate": 1e-05, "loss": 1.041, "step": 6720 }, { "epoch": 1.16, "grad_norm": 2.6241724491119385, "learning_rate": 1e-05, "loss": 1.0084, "step": 6730 }, { "epoch": 1.16, "grad_norm": 2.2282745838165283, "learning_rate": 1e-05, "loss": 0.8996, "step": 6740 }, { "epoch": 1.16, "grad_norm": 2.343564987182617, "learning_rate": 1e-05, "loss": 1.0102, "step": 6750 }, { "epoch": 1.16, "grad_norm": 2.3204636573791504, "learning_rate": 1e-05, "loss": 0.9502, "step": 6760 }, { "epoch": 1.16, "grad_norm": 2.019423484802246, "learning_rate": 1e-05, "loss": 0.9669, "step": 6770 }, { "epoch": 1.16, "grad_norm": 2.683650493621826, "learning_rate": 1e-05, "loss": 0.9615, "step": 6780 }, { "epoch": 1.17, "grad_norm": 2.899923086166382, "learning_rate": 1e-05, "loss": 0.9532, "step": 6790 }, { "epoch": 1.17, "grad_norm": 2.707458019256592, "learning_rate": 1e-05, "loss": 1.0163, "step": 6800 }, { "epoch": 1.17, "grad_norm": 2.8542733192443848, "learning_rate": 1e-05, "loss": 1.0348, "step": 6810 }, { "epoch": 1.17, "grad_norm": 2.4394218921661377, "learning_rate": 1e-05, "loss": 1.1294, "step": 6820 }, { "epoch": 1.17, "grad_norm": 3.711557149887085, "learning_rate": 1e-05, "loss": 0.9588, "step": 6830 }, { "epoch": 1.17, "grad_norm": 2.206164836883545, "learning_rate": 1e-05, "loss": 1.0459, "step": 6840 }, { "epoch": 1.18, "grad_norm": 2.1192731857299805, "learning_rate": 1e-05, "loss": 1.1014, "step": 6850 }, { "epoch": 1.18, "grad_norm": 2.472332000732422, "learning_rate": 1e-05, "loss": 1.0396, "step": 6860 }, { "epoch": 1.18, "grad_norm": 2.860532522201538, "learning_rate": 1e-05, "loss": 1.0652, "step": 6870 }, { "epoch": 1.18, "grad_norm": 1.9870842695236206, "learning_rate": 1e-05, "loss": 0.941, "step": 6880 }, { "epoch": 1.18, "grad_norm": 2.527759552001953, "learning_rate": 1e-05, "loss": 1.0487, "step": 6890 }, { "epoch": 1.18, "grad_norm": 3.2542383670806885, "learning_rate": 1e-05, "loss": 1.1416, "step": 6900 }, { "epoch": 1.19, "grad_norm": 2.2846920490264893, "learning_rate": 1e-05, "loss": 1.0324, "step": 6910 }, { "epoch": 1.19, "grad_norm": 2.865117073059082, "learning_rate": 1e-05, "loss": 1.0439, "step": 6920 }, { "epoch": 1.19, "grad_norm": 2.6450443267822266, "learning_rate": 1e-05, "loss": 0.9566, "step": 6930 }, { "epoch": 1.19, "grad_norm": 3.0172007083892822, "learning_rate": 1e-05, "loss": 1.0769, "step": 6940 }, { "epoch": 1.19, "grad_norm": 4.062388896942139, "learning_rate": 1e-05, "loss": 1.0179, "step": 6950 }, { "epoch": 1.2, "grad_norm": 3.4772136211395264, "learning_rate": 1e-05, "loss": 0.9977, "step": 6960 }, { "epoch": 1.2, "grad_norm": 2.269806385040283, "learning_rate": 1e-05, "loss": 0.9665, "step": 6970 }, { "epoch": 1.2, "grad_norm": 2.0248801708221436, "learning_rate": 1e-05, "loss": 1.0624, "step": 6980 }, { "epoch": 1.2, "grad_norm": 3.0228490829467773, "learning_rate": 1e-05, "loss": 1.0126, "step": 6990 }, { "epoch": 1.2, "grad_norm": 2.6162188053131104, "learning_rate": 1e-05, "loss": 0.9477, "step": 7000 }, { "epoch": 1.2, "grad_norm": 2.0334103107452393, "learning_rate": 1e-05, "loss": 1.0403, "step": 7010 }, { "epoch": 1.21, "grad_norm": 2.4257733821868896, "learning_rate": 1e-05, "loss": 0.9266, "step": 7020 }, { "epoch": 1.21, "grad_norm": 2.222496509552002, "learning_rate": 1e-05, "loss": 0.9598, "step": 7030 }, { "epoch": 1.21, "grad_norm": 3.3846302032470703, "learning_rate": 1e-05, "loss": 1.0264, "step": 7040 }, { "epoch": 1.21, "grad_norm": 3.849187135696411, "learning_rate": 1e-05, "loss": 1.0838, "step": 7050 }, { "epoch": 1.21, "grad_norm": 2.097158432006836, "learning_rate": 1e-05, "loss": 0.9943, "step": 7060 }, { "epoch": 1.21, "grad_norm": 2.259446859359741, "learning_rate": 1e-05, "loss": 1.0005, "step": 7070 }, { "epoch": 1.22, "grad_norm": 1.8879165649414062, "learning_rate": 1e-05, "loss": 1.0426, "step": 7080 }, { "epoch": 1.22, "grad_norm": 2.0662331581115723, "learning_rate": 1e-05, "loss": 1.0646, "step": 7090 }, { "epoch": 1.22, "grad_norm": 2.568582057952881, "learning_rate": 1e-05, "loss": 1.0497, "step": 7100 }, { "epoch": 1.22, "grad_norm": 2.3837697505950928, "learning_rate": 1e-05, "loss": 1.0589, "step": 7110 }, { "epoch": 1.22, "grad_norm": 2.908559560775757, "learning_rate": 1e-05, "loss": 0.9937, "step": 7120 }, { "epoch": 1.22, "grad_norm": 2.302889823913574, "learning_rate": 1e-05, "loss": 1.042, "step": 7130 }, { "epoch": 1.23, "grad_norm": 2.683570146560669, "learning_rate": 1e-05, "loss": 1.1151, "step": 7140 }, { "epoch": 1.23, "grad_norm": 2.5803396701812744, "learning_rate": 1e-05, "loss": 1.0203, "step": 7150 }, { "epoch": 1.23, "grad_norm": 2.4933981895446777, "learning_rate": 1e-05, "loss": 1.0585, "step": 7160 }, { "epoch": 1.23, "grad_norm": 2.319493055343628, "learning_rate": 1e-05, "loss": 1.0397, "step": 7170 }, { "epoch": 1.23, "grad_norm": 2.282255172729492, "learning_rate": 1e-05, "loss": 1.0919, "step": 7180 }, { "epoch": 1.23, "grad_norm": 2.5834693908691406, "learning_rate": 1e-05, "loss": 0.9382, "step": 7190 }, { "epoch": 1.24, "grad_norm": 2.854360818862915, "learning_rate": 1e-05, "loss": 1.0088, "step": 7200 }, { "epoch": 1.24, "grad_norm": 2.8476474285125732, "learning_rate": 1e-05, "loss": 0.9774, "step": 7210 }, { "epoch": 1.24, "grad_norm": 2.5642004013061523, "learning_rate": 1e-05, "loss": 1.0274, "step": 7220 }, { "epoch": 1.24, "grad_norm": 2.215512990951538, "learning_rate": 1e-05, "loss": 1.073, "step": 7230 }, { "epoch": 1.24, "grad_norm": 3.149976968765259, "learning_rate": 1e-05, "loss": 1.0018, "step": 7240 }, { "epoch": 1.25, "grad_norm": 2.221637010574341, "learning_rate": 1e-05, "loss": 0.9686, "step": 7250 }, { "epoch": 1.25, "grad_norm": 2.236781358718872, "learning_rate": 1e-05, "loss": 1.027, "step": 7260 }, { "epoch": 1.25, "grad_norm": 2.7379977703094482, "learning_rate": 1e-05, "loss": 0.9815, "step": 7270 }, { "epoch": 1.25, "grad_norm": 2.726146697998047, "learning_rate": 1e-05, "loss": 0.9777, "step": 7280 }, { "epoch": 1.25, "grad_norm": 2.037290573120117, "learning_rate": 1e-05, "loss": 0.9795, "step": 7290 }, { "epoch": 1.25, "grad_norm": 2.7081196308135986, "learning_rate": 1e-05, "loss": 1.0047, "step": 7300 }, { "epoch": 1.26, "grad_norm": 2.575348138809204, "learning_rate": 1e-05, "loss": 0.9683, "step": 7310 }, { "epoch": 1.26, "grad_norm": 2.6672821044921875, "learning_rate": 1e-05, "loss": 1.0013, "step": 7320 }, { "epoch": 1.26, "grad_norm": 2.878222703933716, "learning_rate": 1e-05, "loss": 1.0415, "step": 7330 }, { "epoch": 1.26, "grad_norm": 1.99146568775177, "learning_rate": 1e-05, "loss": 1.0387, "step": 7340 }, { "epoch": 1.26, "grad_norm": 2.957350492477417, "learning_rate": 1e-05, "loss": 1.0126, "step": 7350 }, { "epoch": 1.26, "grad_norm": 2.2003567218780518, "learning_rate": 1e-05, "loss": 0.888, "step": 7360 }, { "epoch": 1.27, "grad_norm": 2.5942909717559814, "learning_rate": 1e-05, "loss": 1.0561, "step": 7370 }, { "epoch": 1.27, "grad_norm": 2.9146692752838135, "learning_rate": 1e-05, "loss": 1.018, "step": 7380 }, { "epoch": 1.27, "grad_norm": 2.8458762168884277, "learning_rate": 1e-05, "loss": 1.0827, "step": 7390 }, { "epoch": 1.27, "grad_norm": 1.8542159795761108, "learning_rate": 1e-05, "loss": 1.0302, "step": 7400 }, { "epoch": 1.27, "grad_norm": 2.5300040245056152, "learning_rate": 1e-05, "loss": 1.0084, "step": 7410 }, { "epoch": 1.27, "grad_norm": 3.089399576187134, "learning_rate": 1e-05, "loss": 0.9713, "step": 7420 }, { "epoch": 1.28, "grad_norm": 2.373389482498169, "learning_rate": 1e-05, "loss": 1.0608, "step": 7430 }, { "epoch": 1.28, "grad_norm": 2.3854312896728516, "learning_rate": 1e-05, "loss": 1.0177, "step": 7440 }, { "epoch": 1.28, "grad_norm": 2.3859362602233887, "learning_rate": 1e-05, "loss": 0.9844, "step": 7450 }, { "epoch": 1.28, "grad_norm": 2.767002820968628, "learning_rate": 1e-05, "loss": 1.0356, "step": 7460 }, { "epoch": 1.28, "grad_norm": 2.1323554515838623, "learning_rate": 1e-05, "loss": 0.9526, "step": 7470 }, { "epoch": 1.28, "grad_norm": 2.690873622894287, "learning_rate": 1e-05, "loss": 0.9732, "step": 7480 }, { "epoch": 1.29, "grad_norm": 3.478398561477661, "learning_rate": 1e-05, "loss": 0.9905, "step": 7490 }, { "epoch": 1.29, "grad_norm": 1.8604772090911865, "learning_rate": 1e-05, "loss": 0.9636, "step": 7500 }, { "epoch": 1.29, "grad_norm": 2.680375576019287, "learning_rate": 1e-05, "loss": 0.9631, "step": 7510 }, { "epoch": 1.29, "grad_norm": 3.15445613861084, "learning_rate": 1e-05, "loss": 1.0039, "step": 7520 }, { "epoch": 1.29, "grad_norm": 3.388442039489746, "learning_rate": 1e-05, "loss": 0.9484, "step": 7530 }, { "epoch": 1.29, "grad_norm": 2.1244375705718994, "learning_rate": 1e-05, "loss": 0.9882, "step": 7540 }, { "epoch": 1.3, "grad_norm": 1.9992311000823975, "learning_rate": 1e-05, "loss": 0.9835, "step": 7550 }, { "epoch": 1.3, "grad_norm": 2.6015098094940186, "learning_rate": 1e-05, "loss": 1.0097, "step": 7560 }, { "epoch": 1.3, "grad_norm": 1.9045817852020264, "learning_rate": 1e-05, "loss": 0.9762, "step": 7570 }, { "epoch": 1.3, "grad_norm": 2.8165290355682373, "learning_rate": 1e-05, "loss": 0.9632, "step": 7580 }, { "epoch": 1.3, "grad_norm": 3.154522657394409, "learning_rate": 1e-05, "loss": 1.0249, "step": 7590 }, { "epoch": 1.31, "grad_norm": 3.180574893951416, "learning_rate": 1e-05, "loss": 1.0563, "step": 7600 }, { "epoch": 1.31, "grad_norm": 4.546901226043701, "learning_rate": 1e-05, "loss": 1.0525, "step": 7610 }, { "epoch": 1.31, "grad_norm": 2.081376552581787, "learning_rate": 1e-05, "loss": 0.9529, "step": 7620 }, { "epoch": 1.31, "grad_norm": 2.6588551998138428, "learning_rate": 1e-05, "loss": 1.0004, "step": 7630 }, { "epoch": 1.31, "grad_norm": 2.580198049545288, "learning_rate": 1e-05, "loss": 1.0921, "step": 7640 }, { "epoch": 1.31, "grad_norm": 2.5459742546081543, "learning_rate": 1e-05, "loss": 0.9399, "step": 7650 }, { "epoch": 1.32, "grad_norm": 2.4501724243164062, "learning_rate": 1e-05, "loss": 1.0721, "step": 7660 }, { "epoch": 1.32, "grad_norm": 2.613929033279419, "learning_rate": 1e-05, "loss": 1.0456, "step": 7670 }, { "epoch": 1.32, "grad_norm": 1.8623414039611816, "learning_rate": 1e-05, "loss": 0.9783, "step": 7680 }, { "epoch": 1.32, "grad_norm": 2.3043484687805176, "learning_rate": 1e-05, "loss": 1.0286, "step": 7690 }, { "epoch": 1.32, "grad_norm": 2.5169615745544434, "learning_rate": 1e-05, "loss": 1.0884, "step": 7700 }, { "epoch": 1.32, "grad_norm": 2.3486950397491455, "learning_rate": 1e-05, "loss": 0.9957, "step": 7710 }, { "epoch": 1.33, "grad_norm": 3.348045587539673, "learning_rate": 1e-05, "loss": 0.9804, "step": 7720 }, { "epoch": 1.33, "grad_norm": 2.1623213291168213, "learning_rate": 1e-05, "loss": 1.0494, "step": 7730 }, { "epoch": 1.33, "grad_norm": 1.6808427572250366, "learning_rate": 1e-05, "loss": 1.0663, "step": 7740 }, { "epoch": 1.33, "grad_norm": 2.15962290763855, "learning_rate": 1e-05, "loss": 1.0142, "step": 7750 }, { "epoch": 1.33, "grad_norm": 2.7483596801757812, "learning_rate": 1e-05, "loss": 1.0353, "step": 7760 }, { "epoch": 1.33, "grad_norm": 3.0963194370269775, "learning_rate": 1e-05, "loss": 1.0867, "step": 7770 }, { "epoch": 1.34, "grad_norm": 2.584580898284912, "learning_rate": 1e-05, "loss": 1.0744, "step": 7780 }, { "epoch": 1.34, "grad_norm": 3.4675183296203613, "learning_rate": 1e-05, "loss": 1.0784, "step": 7790 }, { "epoch": 1.34, "grad_norm": 2.236694097518921, "learning_rate": 1e-05, "loss": 1.0281, "step": 7800 }, { "epoch": 1.34, "grad_norm": 5.138028621673584, "learning_rate": 1e-05, "loss": 1.0198, "step": 7810 }, { "epoch": 1.34, "grad_norm": 2.8963077068328857, "learning_rate": 1e-05, "loss": 0.9776, "step": 7820 }, { "epoch": 1.34, "grad_norm": 3.836012125015259, "learning_rate": 1e-05, "loss": 1.0364, "step": 7830 }, { "epoch": 1.35, "grad_norm": 3.667417287826538, "learning_rate": 1e-05, "loss": 1.013, "step": 7840 }, { "epoch": 1.35, "grad_norm": 2.6331534385681152, "learning_rate": 1e-05, "loss": 1.0265, "step": 7850 }, { "epoch": 1.35, "grad_norm": 2.822760820388794, "learning_rate": 1e-05, "loss": 1.078, "step": 7860 }, { "epoch": 1.35, "grad_norm": 2.772062301635742, "learning_rate": 1e-05, "loss": 0.9334, "step": 7870 }, { "epoch": 1.35, "grad_norm": 2.1583237648010254, "learning_rate": 1e-05, "loss": 0.8797, "step": 7880 }, { "epoch": 1.35, "grad_norm": 3.0897679328918457, "learning_rate": 1e-05, "loss": 0.9849, "step": 7890 }, { "epoch": 1.36, "grad_norm": 2.45992112159729, "learning_rate": 1e-05, "loss": 1.0646, "step": 7900 }, { "epoch": 1.36, "grad_norm": 2.1727113723754883, "learning_rate": 1e-05, "loss": 1.0321, "step": 7910 }, { "epoch": 1.36, "grad_norm": 2.950498580932617, "learning_rate": 1e-05, "loss": 1.0255, "step": 7920 }, { "epoch": 1.36, "grad_norm": 2.294767141342163, "learning_rate": 1e-05, "loss": 0.989, "step": 7930 }, { "epoch": 1.36, "grad_norm": 2.484980583190918, "learning_rate": 1e-05, "loss": 0.9724, "step": 7940 }, { "epoch": 1.37, "grad_norm": 3.3656985759735107, "learning_rate": 1e-05, "loss": 1.0837, "step": 7950 }, { "epoch": 1.37, "grad_norm": 2.5616161823272705, "learning_rate": 1e-05, "loss": 1.0084, "step": 7960 }, { "epoch": 1.37, "grad_norm": 3.1650032997131348, "learning_rate": 1e-05, "loss": 1.1708, "step": 7970 }, { "epoch": 1.37, "grad_norm": 2.4175758361816406, "learning_rate": 1e-05, "loss": 1.0317, "step": 7980 }, { "epoch": 1.37, "grad_norm": 2.108388662338257, "learning_rate": 1e-05, "loss": 1.0534, "step": 7990 }, { "epoch": 1.37, "grad_norm": 2.224473714828491, "learning_rate": 1e-05, "loss": 0.9725, "step": 8000 }, { "epoch": 1.38, "grad_norm": 2.2060694694519043, "learning_rate": 1e-05, "loss": 1.004, "step": 8010 }, { "epoch": 1.38, "grad_norm": 1.9540616273880005, "learning_rate": 1e-05, "loss": 0.8094, "step": 8020 }, { "epoch": 1.38, "grad_norm": 3.420384645462036, "learning_rate": 1e-05, "loss": 1.042, "step": 8030 }, { "epoch": 1.38, "grad_norm": 2.2382652759552, "learning_rate": 1e-05, "loss": 1.0907, "step": 8040 }, { "epoch": 1.38, "grad_norm": 3.1269848346710205, "learning_rate": 1e-05, "loss": 1.0609, "step": 8050 }, { "epoch": 1.38, "grad_norm": 2.7086892127990723, "learning_rate": 1e-05, "loss": 0.9583, "step": 8060 }, { "epoch": 1.39, "grad_norm": 2.5433905124664307, "learning_rate": 1e-05, "loss": 0.9998, "step": 8070 }, { "epoch": 1.39, "grad_norm": 2.4779505729675293, "learning_rate": 1e-05, "loss": 1.0018, "step": 8080 }, { "epoch": 1.39, "grad_norm": 2.4827795028686523, "learning_rate": 1e-05, "loss": 1.0067, "step": 8090 }, { "epoch": 1.39, "grad_norm": 2.616666793823242, "learning_rate": 1e-05, "loss": 1.0181, "step": 8100 }, { "epoch": 1.39, "grad_norm": 2.293524980545044, "learning_rate": 1e-05, "loss": 1.033, "step": 8110 }, { "epoch": 1.39, "grad_norm": 2.3285727500915527, "learning_rate": 1e-05, "loss": 0.9987, "step": 8120 }, { "epoch": 1.4, "grad_norm": 2.2849738597869873, "learning_rate": 1e-05, "loss": 1.014, "step": 8130 }, { "epoch": 1.4, "grad_norm": 3.6285767555236816, "learning_rate": 1e-05, "loss": 1.0186, "step": 8140 }, { "epoch": 1.4, "grad_norm": 2.7985305786132812, "learning_rate": 1e-05, "loss": 1.1144, "step": 8150 }, { "epoch": 1.4, "grad_norm": 2.4430348873138428, "learning_rate": 1e-05, "loss": 1.0596, "step": 8160 }, { "epoch": 1.4, "grad_norm": 2.200226306915283, "learning_rate": 1e-05, "loss": 1.0944, "step": 8170 }, { "epoch": 1.4, "grad_norm": 1.8424794673919678, "learning_rate": 1e-05, "loss": 1.1051, "step": 8180 }, { "epoch": 1.41, "grad_norm": 2.2834882736206055, "learning_rate": 1e-05, "loss": 0.9787, "step": 8190 }, { "epoch": 1.41, "grad_norm": 2.308248519897461, "learning_rate": 1e-05, "loss": 1.0679, "step": 8200 }, { "epoch": 1.41, "grad_norm": 3.680285692214966, "learning_rate": 1e-05, "loss": 0.996, "step": 8210 }, { "epoch": 1.41, "grad_norm": 2.7210543155670166, "learning_rate": 1e-05, "loss": 0.9188, "step": 8220 }, { "epoch": 1.41, "grad_norm": 2.5917136669158936, "learning_rate": 1e-05, "loss": 1.0204, "step": 8230 }, { "epoch": 1.42, "grad_norm": 2.6894149780273438, "learning_rate": 1e-05, "loss": 0.9765, "step": 8240 }, { "epoch": 1.42, "grad_norm": 2.9170303344726562, "learning_rate": 1e-05, "loss": 1.0352, "step": 8250 }, { "epoch": 1.42, "grad_norm": 2.6600093841552734, "learning_rate": 1e-05, "loss": 0.9624, "step": 8260 }, { "epoch": 1.42, "grad_norm": 3.1691620349884033, "learning_rate": 1e-05, "loss": 1.0014, "step": 8270 }, { "epoch": 1.42, "grad_norm": 2.4668002128601074, "learning_rate": 1e-05, "loss": 1.1043, "step": 8280 }, { "epoch": 1.42, "grad_norm": 3.1661458015441895, "learning_rate": 1e-05, "loss": 1.0113, "step": 8290 }, { "epoch": 1.43, "grad_norm": 2.3365416526794434, "learning_rate": 1e-05, "loss": 1.0482, "step": 8300 }, { "epoch": 1.43, "grad_norm": 3.319361925125122, "learning_rate": 1e-05, "loss": 1.1109, "step": 8310 }, { "epoch": 1.43, "grad_norm": 2.097994089126587, "learning_rate": 1e-05, "loss": 0.9544, "step": 8320 }, { "epoch": 1.43, "grad_norm": 2.9699785709381104, "learning_rate": 1e-05, "loss": 1.026, "step": 8330 }, { "epoch": 1.43, "grad_norm": 2.399681568145752, "learning_rate": 1e-05, "loss": 1.0759, "step": 8340 }, { "epoch": 1.43, "grad_norm": 2.4438891410827637, "learning_rate": 1e-05, "loss": 1.0336, "step": 8350 }, { "epoch": 1.44, "grad_norm": 2.4506747722625732, "learning_rate": 1e-05, "loss": 1.0359, "step": 8360 }, { "epoch": 1.44, "grad_norm": 3.281940221786499, "learning_rate": 1e-05, "loss": 1.1251, "step": 8370 }, { "epoch": 1.44, "grad_norm": 2.040092706680298, "learning_rate": 1e-05, "loss": 1.0515, "step": 8380 }, { "epoch": 1.44, "grad_norm": 2.184382677078247, "learning_rate": 1e-05, "loss": 0.9284, "step": 8390 }, { "epoch": 1.44, "grad_norm": 2.638476848602295, "learning_rate": 1e-05, "loss": 1.0476, "step": 8400 }, { "epoch": 1.44, "grad_norm": 2.5259788036346436, "learning_rate": 1e-05, "loss": 1.0159, "step": 8410 }, { "epoch": 1.45, "grad_norm": 2.3715500831604004, "learning_rate": 1e-05, "loss": 1.016, "step": 8420 }, { "epoch": 1.45, "grad_norm": 2.3363490104675293, "learning_rate": 1e-05, "loss": 0.9059, "step": 8430 }, { "epoch": 1.45, "grad_norm": 3.1245510578155518, "learning_rate": 1e-05, "loss": 1.0613, "step": 8440 }, { "epoch": 1.45, "grad_norm": 2.519963502883911, "learning_rate": 1e-05, "loss": 1.0959, "step": 8450 }, { "epoch": 1.45, "grad_norm": 3.118851900100708, "learning_rate": 1e-05, "loss": 1.0572, "step": 8460 }, { "epoch": 1.45, "grad_norm": 2.3911423683166504, "learning_rate": 1e-05, "loss": 1.0629, "step": 8470 }, { "epoch": 1.46, "grad_norm": 3.0603713989257812, "learning_rate": 1e-05, "loss": 1.097, "step": 8480 }, { "epoch": 1.46, "grad_norm": 3.1234920024871826, "learning_rate": 1e-05, "loss": 1.0664, "step": 8490 }, { "epoch": 1.46, "grad_norm": 3.358424186706543, "learning_rate": 1e-05, "loss": 1.0644, "step": 8500 }, { "epoch": 1.46, "grad_norm": 2.3433587551116943, "learning_rate": 1e-05, "loss": 1.0608, "step": 8510 }, { "epoch": 1.46, "grad_norm": 3.0287795066833496, "learning_rate": 1e-05, "loss": 1.0, "step": 8520 }, { "epoch": 1.46, "grad_norm": 2.1926498413085938, "learning_rate": 1e-05, "loss": 0.9871, "step": 8530 }, { "epoch": 1.47, "grad_norm": 2.370448589324951, "learning_rate": 1e-05, "loss": 1.0169, "step": 8540 }, { "epoch": 1.47, "grad_norm": 3.0587711334228516, "learning_rate": 1e-05, "loss": 1.1135, "step": 8550 }, { "epoch": 1.47, "grad_norm": 2.773609161376953, "learning_rate": 1e-05, "loss": 1.0053, "step": 8560 }, { "epoch": 1.47, "grad_norm": 2.279136896133423, "learning_rate": 1e-05, "loss": 1.0227, "step": 8570 }, { "epoch": 1.47, "grad_norm": 2.549975633621216, "learning_rate": 1e-05, "loss": 0.9351, "step": 8580 }, { "epoch": 1.48, "grad_norm": 3.1325323581695557, "learning_rate": 1e-05, "loss": 0.9615, "step": 8590 }, { "epoch": 1.48, "grad_norm": 3.074357748031616, "learning_rate": 1e-05, "loss": 1.0796, "step": 8600 }, { "epoch": 1.48, "grad_norm": 2.1816437244415283, "learning_rate": 1e-05, "loss": 1.0263, "step": 8610 }, { "epoch": 1.48, "grad_norm": 2.568363904953003, "learning_rate": 1e-05, "loss": 1.1019, "step": 8620 }, { "epoch": 1.48, "grad_norm": 2.1658637523651123, "learning_rate": 1e-05, "loss": 1.06, "step": 8630 }, { "epoch": 1.48, "grad_norm": 2.6504440307617188, "learning_rate": 1e-05, "loss": 1.0738, "step": 8640 }, { "epoch": 1.49, "grad_norm": 2.858308792114258, "learning_rate": 1e-05, "loss": 1.0557, "step": 8650 }, { "epoch": 1.49, "grad_norm": 2.064356803894043, "learning_rate": 1e-05, "loss": 1.011, "step": 8660 }, { "epoch": 1.49, "grad_norm": 3.340825080871582, "learning_rate": 1e-05, "loss": 1.0774, "step": 8670 }, { "epoch": 1.49, "grad_norm": 3.6675853729248047, "learning_rate": 1e-05, "loss": 1.1196, "step": 8680 }, { "epoch": 1.49, "grad_norm": 1.820267677307129, "learning_rate": 1e-05, "loss": 0.8902, "step": 8690 }, { "epoch": 1.49, "grad_norm": 2.1053669452667236, "learning_rate": 1e-05, "loss": 0.993, "step": 8700 }, { "epoch": 1.5, "grad_norm": 1.9717823266983032, "learning_rate": 1e-05, "loss": 1.0134, "step": 8710 }, { "epoch": 1.5, "grad_norm": 2.224724292755127, "learning_rate": 1e-05, "loss": 0.9832, "step": 8720 }, { "epoch": 1.5, "grad_norm": 2.688229560852051, "learning_rate": 1e-05, "loss": 1.099, "step": 8730 }, { "epoch": 1.5, "grad_norm": 2.633141279220581, "learning_rate": 1e-05, "loss": 0.999, "step": 8740 }, { "epoch": 1.5, "grad_norm": 3.245795726776123, "learning_rate": 1e-05, "loss": 0.97, "step": 8750 }, { "epoch": 1.5, "grad_norm": 4.065086364746094, "learning_rate": 1e-05, "loss": 1.1426, "step": 8760 }, { "epoch": 1.51, "grad_norm": 3.8590381145477295, "learning_rate": 1e-05, "loss": 1.0187, "step": 8770 }, { "epoch": 1.51, "grad_norm": 2.5020790100097656, "learning_rate": 1e-05, "loss": 0.9483, "step": 8780 }, { "epoch": 1.51, "grad_norm": 2.2966628074645996, "learning_rate": 1e-05, "loss": 0.9955, "step": 8790 }, { "epoch": 1.51, "grad_norm": 2.021965980529785, "learning_rate": 1e-05, "loss": 1.1033, "step": 8800 }, { "epoch": 1.51, "grad_norm": 2.4566996097564697, "learning_rate": 1e-05, "loss": 1.0353, "step": 8810 }, { "epoch": 1.51, "grad_norm": 2.8194572925567627, "learning_rate": 1e-05, "loss": 1.0071, "step": 8820 }, { "epoch": 1.52, "grad_norm": 2.5652499198913574, "learning_rate": 1e-05, "loss": 1.0224, "step": 8830 }, { "epoch": 1.52, "grad_norm": 2.745292901992798, "learning_rate": 1e-05, "loss": 1.0342, "step": 8840 }, { "epoch": 1.52, "grad_norm": 2.3959574699401855, "learning_rate": 1e-05, "loss": 0.9508, "step": 8850 }, { "epoch": 1.52, "grad_norm": 1.9239192008972168, "learning_rate": 1e-05, "loss": 1.0539, "step": 8860 }, { "epoch": 1.52, "grad_norm": 2.345761299133301, "learning_rate": 1e-05, "loss": 1.0635, "step": 8870 }, { "epoch": 1.52, "grad_norm": 2.51446270942688, "learning_rate": 1e-05, "loss": 1.0992, "step": 8880 }, { "epoch": 1.53, "grad_norm": 1.9711889028549194, "learning_rate": 1e-05, "loss": 1.0458, "step": 8890 }, { "epoch": 1.53, "grad_norm": 2.8537259101867676, "learning_rate": 1e-05, "loss": 0.9539, "step": 8900 }, { "epoch": 1.53, "grad_norm": 2.368511438369751, "learning_rate": 1e-05, "loss": 0.9527, "step": 8910 }, { "epoch": 1.53, "grad_norm": 3.031531810760498, "learning_rate": 1e-05, "loss": 1.1827, "step": 8920 }, { "epoch": 1.53, "grad_norm": 2.894315004348755, "learning_rate": 1e-05, "loss": 1.0435, "step": 8930 }, { "epoch": 1.54, "grad_norm": 3.3386778831481934, "learning_rate": 1e-05, "loss": 1.0066, "step": 8940 }, { "epoch": 1.54, "grad_norm": 3.863614320755005, "learning_rate": 1e-05, "loss": 1.1069, "step": 8950 }, { "epoch": 1.54, "grad_norm": 2.729940176010132, "learning_rate": 1e-05, "loss": 1.0084, "step": 8960 }, { "epoch": 1.54, "grad_norm": 2.8039450645446777, "learning_rate": 1e-05, "loss": 0.9883, "step": 8970 }, { "epoch": 1.54, "grad_norm": 2.1626617908477783, "learning_rate": 1e-05, "loss": 0.9813, "step": 8980 }, { "epoch": 1.54, "grad_norm": 3.061138153076172, "learning_rate": 1e-05, "loss": 1.0289, "step": 8990 }, { "epoch": 1.55, "grad_norm": 2.4434292316436768, "learning_rate": 1e-05, "loss": 1.0519, "step": 9000 }, { "epoch": 1.55, "grad_norm": 2.2953484058380127, "learning_rate": 1e-05, "loss": 1.0233, "step": 9010 }, { "epoch": 1.55, "grad_norm": 2.4721264839172363, "learning_rate": 1e-05, "loss": 1.0358, "step": 9020 }, { "epoch": 1.55, "grad_norm": 1.9545519351959229, "learning_rate": 1e-05, "loss": 1.0004, "step": 9030 }, { "epoch": 1.55, "grad_norm": 2.3768677711486816, "learning_rate": 1e-05, "loss": 1.0209, "step": 9040 }, { "epoch": 1.55, "grad_norm": 2.243574619293213, "learning_rate": 1e-05, "loss": 0.9869, "step": 9050 }, { "epoch": 1.56, "grad_norm": 2.229619264602661, "learning_rate": 1e-05, "loss": 0.9306, "step": 9060 }, { "epoch": 1.56, "grad_norm": 2.9487369060516357, "learning_rate": 1e-05, "loss": 1.0493, "step": 9070 }, { "epoch": 1.56, "grad_norm": 1.8562098741531372, "learning_rate": 1e-05, "loss": 1.0514, "step": 9080 }, { "epoch": 1.56, "grad_norm": 2.3863279819488525, "learning_rate": 1e-05, "loss": 0.9485, "step": 9090 }, { "epoch": 1.56, "grad_norm": 6.1171770095825195, "learning_rate": 1e-05, "loss": 1.1196, "step": 9100 }, { "epoch": 1.56, "grad_norm": 2.144599199295044, "learning_rate": 1e-05, "loss": 1.0483, "step": 9110 }, { "epoch": 1.57, "grad_norm": 2.211543560028076, "learning_rate": 1e-05, "loss": 1.128, "step": 9120 }, { "epoch": 1.57, "grad_norm": 1.9530116319656372, "learning_rate": 1e-05, "loss": 0.9435, "step": 9130 }, { "epoch": 1.57, "grad_norm": 2.7647440433502197, "learning_rate": 1e-05, "loss": 1.0141, "step": 9140 }, { "epoch": 1.57, "grad_norm": 2.4697787761688232, "learning_rate": 1e-05, "loss": 1.0476, "step": 9150 }, { "epoch": 1.57, "grad_norm": 3.207587480545044, "learning_rate": 1e-05, "loss": 0.992, "step": 9160 }, { "epoch": 1.57, "grad_norm": 2.5492539405822754, "learning_rate": 1e-05, "loss": 0.9752, "step": 9170 }, { "epoch": 1.58, "grad_norm": 2.7407188415527344, "learning_rate": 1e-05, "loss": 1.0607, "step": 9180 }, { "epoch": 1.58, "grad_norm": 2.3383569717407227, "learning_rate": 1e-05, "loss": 1.0751, "step": 9190 }, { "epoch": 1.58, "grad_norm": 2.1230878829956055, "learning_rate": 1e-05, "loss": 0.952, "step": 9200 }, { "epoch": 1.58, "grad_norm": 1.9829449653625488, "learning_rate": 1e-05, "loss": 1.0335, "step": 9210 }, { "epoch": 1.58, "grad_norm": 4.799077987670898, "learning_rate": 1e-05, "loss": 1.0537, "step": 9220 }, { "epoch": 1.59, "grad_norm": 2.8595337867736816, "learning_rate": 1e-05, "loss": 1.0133, "step": 9230 }, { "epoch": 1.59, "grad_norm": 2.6511495113372803, "learning_rate": 1e-05, "loss": 1.0115, "step": 9240 }, { "epoch": 1.59, "grad_norm": 1.9823254346847534, "learning_rate": 1e-05, "loss": 1.0785, "step": 9250 }, { "epoch": 1.59, "grad_norm": 2.7708723545074463, "learning_rate": 1e-05, "loss": 1.1013, "step": 9260 }, { "epoch": 1.59, "grad_norm": 2.3901848793029785, "learning_rate": 1e-05, "loss": 1.0543, "step": 9270 }, { "epoch": 1.59, "grad_norm": 2.7122440338134766, "learning_rate": 1e-05, "loss": 1.1529, "step": 9280 }, { "epoch": 1.6, "grad_norm": 3.1970365047454834, "learning_rate": 1e-05, "loss": 0.9951, "step": 9290 }, { "epoch": 1.6, "grad_norm": 2.774059534072876, "learning_rate": 1e-05, "loss": 1.0656, "step": 9300 }, { "epoch": 1.6, "grad_norm": 2.749340295791626, "learning_rate": 1e-05, "loss": 1.0989, "step": 9310 }, { "epoch": 1.6, "grad_norm": 2.2948596477508545, "learning_rate": 1e-05, "loss": 1.0436, "step": 9320 }, { "epoch": 1.6, "grad_norm": 3.7892682552337646, "learning_rate": 1e-05, "loss": 1.0071, "step": 9330 }, { "epoch": 1.6, "grad_norm": 2.0370535850524902, "learning_rate": 1e-05, "loss": 1.0727, "step": 9340 }, { "epoch": 1.61, "grad_norm": 2.822446823120117, "learning_rate": 1e-05, "loss": 1.0199, "step": 9350 }, { "epoch": 1.61, "grad_norm": 2.6964447498321533, "learning_rate": 1e-05, "loss": 1.0402, "step": 9360 }, { "epoch": 1.61, "grad_norm": 2.7196106910705566, "learning_rate": 1e-05, "loss": 0.9824, "step": 9370 }, { "epoch": 1.61, "grad_norm": 2.7182745933532715, "learning_rate": 1e-05, "loss": 1.0771, "step": 9380 }, { "epoch": 1.61, "grad_norm": 2.1117007732391357, "learning_rate": 1e-05, "loss": 1.0465, "step": 9390 }, { "epoch": 1.61, "grad_norm": 1.7967630624771118, "learning_rate": 1e-05, "loss": 0.992, "step": 9400 }, { "epoch": 1.62, "grad_norm": 2.49383282661438, "learning_rate": 1e-05, "loss": 1.0395, "step": 9410 }, { "epoch": 1.62, "grad_norm": 2.5202503204345703, "learning_rate": 1e-05, "loss": 1.0795, "step": 9420 }, { "epoch": 1.62, "grad_norm": 2.5935745239257812, "learning_rate": 1e-05, "loss": 1.02, "step": 9430 }, { "epoch": 1.62, "grad_norm": 2.4098920822143555, "learning_rate": 1e-05, "loss": 1.03, "step": 9440 }, { "epoch": 1.62, "grad_norm": 2.3071539402008057, "learning_rate": 1e-05, "loss": 1.0464, "step": 9450 }, { "epoch": 1.62, "grad_norm": 1.990229606628418, "learning_rate": 1e-05, "loss": 0.9617, "step": 9460 }, { "epoch": 1.63, "grad_norm": 2.6037960052490234, "learning_rate": 1e-05, "loss": 0.9039, "step": 9470 }, { "epoch": 1.63, "grad_norm": 5.809933185577393, "learning_rate": 1e-05, "loss": 1.1791, "step": 9480 }, { "epoch": 1.63, "grad_norm": 2.2311456203460693, "learning_rate": 1e-05, "loss": 0.9794, "step": 9490 }, { "epoch": 1.63, "grad_norm": 3.2406017780303955, "learning_rate": 1e-05, "loss": 1.0225, "step": 9500 }, { "epoch": 1.63, "grad_norm": 2.0920634269714355, "learning_rate": 1e-05, "loss": 0.9394, "step": 9510 }, { "epoch": 1.63, "grad_norm": 2.660165309906006, "learning_rate": 1e-05, "loss": 1.0346, "step": 9520 }, { "epoch": 1.64, "grad_norm": 2.8337314128875732, "learning_rate": 1e-05, "loss": 1.1071, "step": 9530 }, { "epoch": 1.64, "grad_norm": 2.5402066707611084, "learning_rate": 1e-05, "loss": 1.0252, "step": 9540 }, { "epoch": 1.64, "grad_norm": 2.081479549407959, "learning_rate": 1e-05, "loss": 1.021, "step": 9550 }, { "epoch": 1.64, "grad_norm": 1.7497069835662842, "learning_rate": 1e-05, "loss": 1.0593, "step": 9560 }, { "epoch": 1.64, "grad_norm": 2.5563762187957764, "learning_rate": 1e-05, "loss": 1.1167, "step": 9570 }, { "epoch": 1.65, "grad_norm": 1.853631854057312, "learning_rate": 1e-05, "loss": 0.9478, "step": 9580 }, { "epoch": 1.65, "grad_norm": 1.8485876321792603, "learning_rate": 1e-05, "loss": 0.943, "step": 9590 }, { "epoch": 1.65, "grad_norm": 2.1595373153686523, "learning_rate": 1e-05, "loss": 1.0255, "step": 9600 }, { "epoch": 1.65, "grad_norm": 2.704416513442993, "learning_rate": 1e-05, "loss": 1.0146, "step": 9610 }, { "epoch": 1.65, "grad_norm": 3.183342933654785, "learning_rate": 1e-05, "loss": 0.9901, "step": 9620 }, { "epoch": 1.65, "grad_norm": 2.692582130432129, "learning_rate": 1e-05, "loss": 1.0983, "step": 9630 }, { "epoch": 1.66, "grad_norm": 2.6139438152313232, "learning_rate": 1e-05, "loss": 0.9617, "step": 9640 }, { "epoch": 1.66, "grad_norm": 3.860938549041748, "learning_rate": 1e-05, "loss": 1.0004, "step": 9650 }, { "epoch": 1.66, "grad_norm": 2.947887420654297, "learning_rate": 1e-05, "loss": 1.0991, "step": 9660 }, { "epoch": 1.66, "grad_norm": 4.666177272796631, "learning_rate": 1e-05, "loss": 1.0992, "step": 9670 }, { "epoch": 1.66, "grad_norm": 3.027334690093994, "learning_rate": 1e-05, "loss": 1.0596, "step": 9680 }, { "epoch": 1.66, "grad_norm": 5.081746578216553, "learning_rate": 1e-05, "loss": 1.0268, "step": 9690 }, { "epoch": 1.67, "grad_norm": 2.0974204540252686, "learning_rate": 1e-05, "loss": 0.9762, "step": 9700 }, { "epoch": 1.67, "grad_norm": 2.575479745864868, "learning_rate": 1e-05, "loss": 1.1449, "step": 9710 }, { "epoch": 1.67, "grad_norm": 2.9377288818359375, "learning_rate": 1e-05, "loss": 1.0244, "step": 9720 }, { "epoch": 1.67, "grad_norm": 2.1439216136932373, "learning_rate": 1e-05, "loss": 1.0813, "step": 9730 }, { "epoch": 1.67, "grad_norm": 2.694765329360962, "learning_rate": 1e-05, "loss": 1.0974, "step": 9740 }, { "epoch": 1.67, "grad_norm": 2.7554609775543213, "learning_rate": 1e-05, "loss": 1.083, "step": 9750 }, { "epoch": 1.68, "grad_norm": 3.0787527561187744, "learning_rate": 1e-05, "loss": 1.0616, "step": 9760 }, { "epoch": 1.68, "grad_norm": 3.455043315887451, "learning_rate": 1e-05, "loss": 1.0272, "step": 9770 }, { "epoch": 1.68, "grad_norm": 2.3333399295806885, "learning_rate": 1e-05, "loss": 1.0452, "step": 9780 }, { "epoch": 1.68, "grad_norm": 2.8097100257873535, "learning_rate": 1e-05, "loss": 1.1011, "step": 9790 }, { "epoch": 1.68, "grad_norm": 2.2194035053253174, "learning_rate": 1e-05, "loss": 1.0118, "step": 9800 }, { "epoch": 1.68, "grad_norm": 3.151822090148926, "learning_rate": 1e-05, "loss": 1.0312, "step": 9810 }, { "epoch": 1.69, "grad_norm": 2.602029323577881, "learning_rate": 1e-05, "loss": 1.009, "step": 9820 }, { "epoch": 1.69, "grad_norm": 2.129317283630371, "learning_rate": 1e-05, "loss": 0.9707, "step": 9830 }, { "epoch": 1.69, "grad_norm": 2.7062203884124756, "learning_rate": 1e-05, "loss": 1.0804, "step": 9840 }, { "epoch": 1.69, "grad_norm": 3.2763800621032715, "learning_rate": 1e-05, "loss": 1.0053, "step": 9850 }, { "epoch": 1.69, "grad_norm": 2.8125734329223633, "learning_rate": 1e-05, "loss": 1.0487, "step": 9860 }, { "epoch": 1.7, "grad_norm": 2.6083922386169434, "learning_rate": 1e-05, "loss": 0.9819, "step": 9870 }, { "epoch": 1.7, "grad_norm": 2.2002546787261963, "learning_rate": 1e-05, "loss": 1.0543, "step": 9880 }, { "epoch": 1.7, "grad_norm": 2.3200905323028564, "learning_rate": 1e-05, "loss": 1.1133, "step": 9890 }, { "epoch": 1.7, "grad_norm": 2.079068422317505, "learning_rate": 1e-05, "loss": 0.998, "step": 9900 }, { "epoch": 1.7, "grad_norm": 2.647366762161255, "learning_rate": 1e-05, "loss": 1.0267, "step": 9910 }, { "epoch": 1.7, "grad_norm": 2.9108846187591553, "learning_rate": 1e-05, "loss": 1.088, "step": 9920 }, { "epoch": 1.71, "grad_norm": 2.3041622638702393, "learning_rate": 1e-05, "loss": 1.2272, "step": 9930 }, { "epoch": 1.71, "grad_norm": 2.5793256759643555, "learning_rate": 1e-05, "loss": 0.9576, "step": 9940 }, { "epoch": 1.71, "grad_norm": 2.012216329574585, "learning_rate": 1e-05, "loss": 0.9958, "step": 9950 }, { "epoch": 1.71, "grad_norm": 2.0991573333740234, "learning_rate": 1e-05, "loss": 1.0183, "step": 9960 }, { "epoch": 1.71, "grad_norm": 2.904991865158081, "learning_rate": 1e-05, "loss": 1.0034, "step": 9970 }, { "epoch": 1.71, "grad_norm": 2.178138017654419, "learning_rate": 1e-05, "loss": 1.056, "step": 9980 }, { "epoch": 1.72, "grad_norm": 2.0459444522857666, "learning_rate": 1e-05, "loss": 1.0381, "step": 9990 }, { "epoch": 1.72, "grad_norm": 3.139091730117798, "learning_rate": 1e-05, "loss": 1.0501, "step": 10000 }, { "epoch": 1.72, "grad_norm": 2.130234718322754, "learning_rate": 1e-05, "loss": 0.9769, "step": 10010 }, { "epoch": 1.72, "grad_norm": 2.086378574371338, "learning_rate": 1e-05, "loss": 1.0102, "step": 10020 }, { "epoch": 1.72, "grad_norm": 4.662710666656494, "learning_rate": 1e-05, "loss": 1.0031, "step": 10030 }, { "epoch": 1.72, "grad_norm": 2.1875267028808594, "learning_rate": 1e-05, "loss": 1.0599, "step": 10040 }, { "epoch": 1.73, "grad_norm": 2.934441089630127, "learning_rate": 1e-05, "loss": 1.055, "step": 10050 }, { "epoch": 1.73, "grad_norm": 2.363140344619751, "learning_rate": 1e-05, "loss": 1.0554, "step": 10060 }, { "epoch": 1.73, "grad_norm": 2.435114622116089, "learning_rate": 1e-05, "loss": 1.0412, "step": 10070 }, { "epoch": 1.73, "grad_norm": 2.3322625160217285, "learning_rate": 1e-05, "loss": 1.0307, "step": 10080 }, { "epoch": 1.73, "grad_norm": 3.0348896980285645, "learning_rate": 1e-05, "loss": 1.106, "step": 10090 }, { "epoch": 1.73, "grad_norm": 3.9253132343292236, "learning_rate": 1e-05, "loss": 0.9802, "step": 10100 }, { "epoch": 1.74, "grad_norm": 2.191121816635132, "learning_rate": 1e-05, "loss": 0.9393, "step": 10110 }, { "epoch": 1.74, "grad_norm": 2.349829912185669, "learning_rate": 1e-05, "loss": 1.0189, "step": 10120 }, { "epoch": 1.74, "grad_norm": 2.29323410987854, "learning_rate": 1e-05, "loss": 1.079, "step": 10130 }, { "epoch": 1.74, "grad_norm": 1.9248541593551636, "learning_rate": 1e-05, "loss": 0.9665, "step": 10140 }, { "epoch": 1.74, "grad_norm": 3.685276746749878, "learning_rate": 1e-05, "loss": 1.0695, "step": 10150 }, { "epoch": 1.74, "grad_norm": 2.027282953262329, "learning_rate": 1e-05, "loss": 1.0601, "step": 10160 }, { "epoch": 1.75, "grad_norm": 2.761247396469116, "learning_rate": 1e-05, "loss": 1.1077, "step": 10170 }, { "epoch": 1.75, "grad_norm": 2.593219518661499, "learning_rate": 1e-05, "loss": 1.0019, "step": 10180 }, { "epoch": 1.75, "grad_norm": 2.4198343753814697, "learning_rate": 1e-05, "loss": 1.0267, "step": 10190 }, { "epoch": 1.75, "grad_norm": 2.7657220363616943, "learning_rate": 1e-05, "loss": 0.999, "step": 10200 }, { "epoch": 1.75, "grad_norm": 2.4793217182159424, "learning_rate": 1e-05, "loss": 0.9946, "step": 10210 }, { "epoch": 1.76, "grad_norm": 2.2479584217071533, "learning_rate": 1e-05, "loss": 1.0673, "step": 10220 }, { "epoch": 1.76, "grad_norm": 1.7868390083312988, "learning_rate": 1e-05, "loss": 0.9873, "step": 10230 }, { "epoch": 1.76, "grad_norm": 2.0158581733703613, "learning_rate": 1e-05, "loss": 0.989, "step": 10240 }, { "epoch": 1.76, "grad_norm": 2.350095272064209, "learning_rate": 1e-05, "loss": 1.035, "step": 10250 }, { "epoch": 1.76, "grad_norm": 2.529557466506958, "learning_rate": 1e-05, "loss": 1.0576, "step": 10260 }, { "epoch": 1.76, "grad_norm": 2.16300630569458, "learning_rate": 1e-05, "loss": 0.9788, "step": 10270 }, { "epoch": 1.77, "grad_norm": 3.6851983070373535, "learning_rate": 1e-05, "loss": 1.1226, "step": 10280 }, { "epoch": 1.77, "grad_norm": 2.6486752033233643, "learning_rate": 1e-05, "loss": 1.126, "step": 10290 }, { "epoch": 1.77, "grad_norm": 2.3545916080474854, "learning_rate": 1e-05, "loss": 1.053, "step": 10300 }, { "epoch": 1.77, "grad_norm": 2.8078691959381104, "learning_rate": 1e-05, "loss": 1.0989, "step": 10310 }, { "epoch": 1.77, "grad_norm": 2.6008193492889404, "learning_rate": 1e-05, "loss": 1.0376, "step": 10320 }, { "epoch": 1.77, "grad_norm": 3.365222692489624, "learning_rate": 1e-05, "loss": 1.053, "step": 10330 }, { "epoch": 1.78, "grad_norm": 2.4086787700653076, "learning_rate": 1e-05, "loss": 1.0464, "step": 10340 }, { "epoch": 1.78, "grad_norm": 2.502668619155884, "learning_rate": 1e-05, "loss": 1.0884, "step": 10350 }, { "epoch": 1.78, "grad_norm": 2.446023941040039, "learning_rate": 1e-05, "loss": 1.1663, "step": 10360 }, { "epoch": 1.78, "grad_norm": 2.8586957454681396, "learning_rate": 1e-05, "loss": 1.1375, "step": 10370 }, { "epoch": 1.78, "grad_norm": 2.550720453262329, "learning_rate": 1e-05, "loss": 1.0003, "step": 10380 }, { "epoch": 1.78, "grad_norm": 3.0043468475341797, "learning_rate": 1e-05, "loss": 1.1144, "step": 10390 }, { "epoch": 1.79, "grad_norm": 1.8488130569458008, "learning_rate": 1e-05, "loss": 0.9635, "step": 10400 }, { "epoch": 1.79, "grad_norm": 2.419969320297241, "learning_rate": 1e-05, "loss": 0.9657, "step": 10410 }, { "epoch": 1.79, "grad_norm": 2.233978748321533, "learning_rate": 1e-05, "loss": 1.034, "step": 10420 }, { "epoch": 1.79, "grad_norm": 2.712808609008789, "learning_rate": 1e-05, "loss": 1.0175, "step": 10430 }, { "epoch": 1.79, "grad_norm": 1.9075738191604614, "learning_rate": 1e-05, "loss": 0.9141, "step": 10440 }, { "epoch": 1.79, "grad_norm": 4.534524917602539, "learning_rate": 1e-05, "loss": 1.079, "step": 10450 }, { "epoch": 1.8, "grad_norm": 2.488118886947632, "learning_rate": 1e-05, "loss": 1.062, "step": 10460 }, { "epoch": 1.8, "grad_norm": 2.9878602027893066, "learning_rate": 1e-05, "loss": 1.003, "step": 10470 }, { "epoch": 1.8, "grad_norm": 2.6196465492248535, "learning_rate": 1e-05, "loss": 1.0319, "step": 10480 }, { "epoch": 1.8, "grad_norm": 2.7483022212982178, "learning_rate": 1e-05, "loss": 1.0101, "step": 10490 }, { "epoch": 1.8, "grad_norm": 2.408924102783203, "learning_rate": 1e-05, "loss": 1.1074, "step": 10500 }, { "epoch": 1.8, "grad_norm": 2.5068442821502686, "learning_rate": 1e-05, "loss": 1.0417, "step": 10510 }, { "epoch": 1.81, "grad_norm": 2.5237784385681152, "learning_rate": 1e-05, "loss": 1.0879, "step": 10520 }, { "epoch": 1.81, "grad_norm": 2.158583402633667, "learning_rate": 1e-05, "loss": 1.0603, "step": 10530 }, { "epoch": 1.81, "grad_norm": 2.305474281311035, "learning_rate": 1e-05, "loss": 0.9813, "step": 10540 }, { "epoch": 1.81, "grad_norm": 2.7699382305145264, "learning_rate": 1e-05, "loss": 0.9862, "step": 10550 }, { "epoch": 1.81, "grad_norm": 2.487882614135742, "learning_rate": 1e-05, "loss": 0.9959, "step": 10560 }, { "epoch": 1.82, "grad_norm": 2.2361204624176025, "learning_rate": 1e-05, "loss": 1.0431, "step": 10570 }, { "epoch": 1.82, "grad_norm": 2.8864893913269043, "learning_rate": 1e-05, "loss": 1.1185, "step": 10580 }, { "epoch": 1.82, "grad_norm": 2.5624547004699707, "learning_rate": 1e-05, "loss": 1.0679, "step": 10590 }, { "epoch": 1.82, "grad_norm": 2.1008965969085693, "learning_rate": 1e-05, "loss": 0.9721, "step": 10600 }, { "epoch": 1.82, "grad_norm": 2.5166285037994385, "learning_rate": 1e-05, "loss": 1.1075, "step": 10610 }, { "epoch": 1.82, "grad_norm": 2.7157206535339355, "learning_rate": 1e-05, "loss": 1.1449, "step": 10620 }, { "epoch": 1.83, "grad_norm": 2.2612836360931396, "learning_rate": 1e-05, "loss": 1.0796, "step": 10630 }, { "epoch": 1.83, "grad_norm": 2.545149803161621, "learning_rate": 1e-05, "loss": 1.0865, "step": 10640 }, { "epoch": 1.83, "grad_norm": 2.023824453353882, "learning_rate": 1e-05, "loss": 1.06, "step": 10650 }, { "epoch": 1.83, "grad_norm": 2.386988878250122, "learning_rate": 1e-05, "loss": 1.0951, "step": 10660 }, { "epoch": 1.83, "grad_norm": 3.322854995727539, "learning_rate": 1e-05, "loss": 0.9547, "step": 10670 }, { "epoch": 1.83, "grad_norm": 2.6029868125915527, "learning_rate": 1e-05, "loss": 1.006, "step": 10680 }, { "epoch": 1.84, "grad_norm": 2.713660717010498, "learning_rate": 1e-05, "loss": 1.0695, "step": 10690 }, { "epoch": 1.84, "grad_norm": 2.325039863586426, "learning_rate": 1e-05, "loss": 1.0567, "step": 10700 }, { "epoch": 1.84, "grad_norm": 2.0697994232177734, "learning_rate": 1e-05, "loss": 1.0052, "step": 10710 }, { "epoch": 1.84, "grad_norm": 2.3208460807800293, "learning_rate": 1e-05, "loss": 1.0542, "step": 10720 }, { "epoch": 1.84, "grad_norm": 2.4321489334106445, "learning_rate": 1e-05, "loss": 1.0296, "step": 10730 }, { "epoch": 1.84, "grad_norm": 2.381392002105713, "learning_rate": 1e-05, "loss": 1.1445, "step": 10740 }, { "epoch": 1.85, "grad_norm": 2.379256248474121, "learning_rate": 1e-05, "loss": 1.0656, "step": 10750 }, { "epoch": 1.85, "grad_norm": 2.5627009868621826, "learning_rate": 1e-05, "loss": 0.9696, "step": 10760 }, { "epoch": 1.85, "grad_norm": 2.7615158557891846, "learning_rate": 1e-05, "loss": 1.0678, "step": 10770 }, { "epoch": 1.85, "grad_norm": 2.501905679702759, "learning_rate": 1e-05, "loss": 1.0085, "step": 10780 }, { "epoch": 1.85, "grad_norm": 1.9379123449325562, "learning_rate": 1e-05, "loss": 1.0005, "step": 10790 }, { "epoch": 1.85, "grad_norm": 3.583933115005493, "learning_rate": 1e-05, "loss": 1.0475, "step": 10800 }, { "epoch": 1.86, "grad_norm": 2.4750759601593018, "learning_rate": 1e-05, "loss": 1.0547, "step": 10810 }, { "epoch": 1.86, "grad_norm": 2.923191547393799, "learning_rate": 1e-05, "loss": 1.0374, "step": 10820 }, { "epoch": 1.86, "grad_norm": 3.4874658584594727, "learning_rate": 1e-05, "loss": 1.0205, "step": 10830 }, { "epoch": 1.86, "grad_norm": 2.9485082626342773, "learning_rate": 1e-05, "loss": 1.0203, "step": 10840 }, { "epoch": 1.86, "grad_norm": 3.664959192276001, "learning_rate": 1e-05, "loss": 1.0121, "step": 10850 }, { "epoch": 1.87, "grad_norm": 3.395310401916504, "learning_rate": 1e-05, "loss": 1.0349, "step": 10860 }, { "epoch": 1.87, "grad_norm": 2.9110727310180664, "learning_rate": 1e-05, "loss": 1.0065, "step": 10870 }, { "epoch": 1.87, "grad_norm": 2.657857894897461, "learning_rate": 1e-05, "loss": 1.0684, "step": 10880 }, { "epoch": 1.87, "grad_norm": 2.985198974609375, "learning_rate": 1e-05, "loss": 1.0981, "step": 10890 }, { "epoch": 1.87, "grad_norm": 2.062721014022827, "learning_rate": 1e-05, "loss": 0.9516, "step": 10900 }, { "epoch": 1.87, "grad_norm": 3.2055904865264893, "learning_rate": 1e-05, "loss": 1.018, "step": 10910 }, { "epoch": 1.88, "grad_norm": 3.117875814437866, "learning_rate": 1e-05, "loss": 1.0365, "step": 10920 }, { "epoch": 1.88, "grad_norm": 1.7929600477218628, "learning_rate": 1e-05, "loss": 1.0048, "step": 10930 }, { "epoch": 1.88, "grad_norm": 2.6944921016693115, "learning_rate": 1e-05, "loss": 1.1761, "step": 10940 }, { "epoch": 1.88, "grad_norm": 2.6278600692749023, "learning_rate": 1e-05, "loss": 1.054, "step": 10950 }, { "epoch": 1.88, "grad_norm": 2.301929235458374, "learning_rate": 1e-05, "loss": 1.003, "step": 10960 }, { "epoch": 1.88, "grad_norm": 2.0498104095458984, "learning_rate": 1e-05, "loss": 1.0503, "step": 10970 }, { "epoch": 1.89, "grad_norm": 2.6599652767181396, "learning_rate": 1e-05, "loss": 1.0086, "step": 10980 }, { "epoch": 1.89, "grad_norm": 2.476015567779541, "learning_rate": 1e-05, "loss": 1.0572, "step": 10990 }, { "epoch": 1.89, "grad_norm": 3.113619327545166, "learning_rate": 1e-05, "loss": 1.0165, "step": 11000 }, { "epoch": 1.89, "grad_norm": 2.55496883392334, "learning_rate": 1e-05, "loss": 1.0654, "step": 11010 }, { "epoch": 1.89, "grad_norm": 2.72886323928833, "learning_rate": 1e-05, "loss": 1.0975, "step": 11020 }, { "epoch": 1.89, "grad_norm": 2.1363415718078613, "learning_rate": 1e-05, "loss": 1.0918, "step": 11030 }, { "epoch": 1.9, "grad_norm": 2.203794002532959, "learning_rate": 1e-05, "loss": 1.0866, "step": 11040 }, { "epoch": 1.9, "grad_norm": 2.8999581336975098, "learning_rate": 1e-05, "loss": 1.0809, "step": 11050 }, { "epoch": 1.9, "grad_norm": 2.8935294151306152, "learning_rate": 1e-05, "loss": 1.0438, "step": 11060 }, { "epoch": 1.9, "grad_norm": 2.6402342319488525, "learning_rate": 1e-05, "loss": 0.9449, "step": 11070 }, { "epoch": 1.9, "grad_norm": 2.666870594024658, "learning_rate": 1e-05, "loss": 1.1531, "step": 11080 }, { "epoch": 1.9, "grad_norm": 2.20949125289917, "learning_rate": 1e-05, "loss": 0.9572, "step": 11090 }, { "epoch": 1.91, "grad_norm": 2.2581710815429688, "learning_rate": 1e-05, "loss": 1.0955, "step": 11100 }, { "epoch": 1.91, "grad_norm": 2.1770179271698, "learning_rate": 1e-05, "loss": 1.0565, "step": 11110 }, { "epoch": 1.91, "grad_norm": 2.217064619064331, "learning_rate": 1e-05, "loss": 1.0962, "step": 11120 }, { "epoch": 1.91, "grad_norm": 1.9007341861724854, "learning_rate": 1e-05, "loss": 1.0419, "step": 11130 }, { "epoch": 1.91, "grad_norm": 2.0666606426239014, "learning_rate": 1e-05, "loss": 0.9801, "step": 11140 }, { "epoch": 1.91, "grad_norm": 2.4318718910217285, "learning_rate": 1e-05, "loss": 1.0225, "step": 11150 }, { "epoch": 1.92, "grad_norm": 2.6048569679260254, "learning_rate": 1e-05, "loss": 1.0034, "step": 11160 }, { "epoch": 1.92, "grad_norm": 3.7844033241271973, "learning_rate": 1e-05, "loss": 1.0736, "step": 11170 }, { "epoch": 1.92, "grad_norm": 2.0433297157287598, "learning_rate": 1e-05, "loss": 1.1076, "step": 11180 }, { "epoch": 1.92, "grad_norm": 2.671957015991211, "learning_rate": 1e-05, "loss": 1.0873, "step": 11190 }, { "epoch": 1.92, "grad_norm": 3.4829108715057373, "learning_rate": 1e-05, "loss": 1.0102, "step": 11200 }, { "epoch": 1.93, "grad_norm": 3.030287981033325, "learning_rate": 1e-05, "loss": 1.1427, "step": 11210 }, { "epoch": 1.93, "grad_norm": 3.293470621109009, "learning_rate": 1e-05, "loss": 1.0337, "step": 11220 }, { "epoch": 1.93, "grad_norm": 3.201225519180298, "learning_rate": 1e-05, "loss": 1.0945, "step": 11230 }, { "epoch": 1.93, "grad_norm": 2.404139518737793, "learning_rate": 1e-05, "loss": 1.0539, "step": 11240 }, { "epoch": 1.93, "grad_norm": 3.0956571102142334, "learning_rate": 1e-05, "loss": 1.0154, "step": 11250 }, { "epoch": 1.93, "grad_norm": 2.3433375358581543, "learning_rate": 1e-05, "loss": 1.0592, "step": 11260 }, { "epoch": 1.94, "grad_norm": 2.212955951690674, "learning_rate": 1e-05, "loss": 0.9882, "step": 11270 }, { "epoch": 1.94, "grad_norm": 2.154575824737549, "learning_rate": 1e-05, "loss": 1.034, "step": 11280 }, { "epoch": 1.94, "grad_norm": 4.967568397521973, "learning_rate": 1e-05, "loss": 1.0411, "step": 11290 }, { "epoch": 1.94, "grad_norm": 1.8709118366241455, "learning_rate": 1e-05, "loss": 1.0207, "step": 11300 }, { "epoch": 1.94, "grad_norm": 2.2156543731689453, "learning_rate": 1e-05, "loss": 1.1151, "step": 11310 }, { "epoch": 1.94, "grad_norm": 2.266279458999634, "learning_rate": 1e-05, "loss": 1.0779, "step": 11320 }, { "epoch": 1.95, "grad_norm": 2.5265564918518066, "learning_rate": 1e-05, "loss": 1.0515, "step": 11330 }, { "epoch": 1.95, "grad_norm": 2.6855623722076416, "learning_rate": 1e-05, "loss": 0.9992, "step": 11340 }, { "epoch": 1.95, "grad_norm": 3.729240894317627, "learning_rate": 1e-05, "loss": 1.0587, "step": 11350 }, { "epoch": 1.95, "grad_norm": 4.715482234954834, "learning_rate": 1e-05, "loss": 1.0766, "step": 11360 }, { "epoch": 1.95, "grad_norm": 3.9233758449554443, "learning_rate": 1e-05, "loss": 1.1171, "step": 11370 }, { "epoch": 1.95, "grad_norm": 2.296520948410034, "learning_rate": 1e-05, "loss": 1.0624, "step": 11380 }, { "epoch": 1.96, "grad_norm": 2.0924103260040283, "learning_rate": 1e-05, "loss": 0.9887, "step": 11390 }, { "epoch": 1.96, "grad_norm": 2.1080734729766846, "learning_rate": 1e-05, "loss": 1.0284, "step": 11400 }, { "epoch": 1.96, "grad_norm": 3.715378522872925, "learning_rate": 1e-05, "loss": 1.0118, "step": 11410 }, { "epoch": 1.96, "grad_norm": 2.225140333175659, "learning_rate": 1e-05, "loss": 1.0021, "step": 11420 }, { "epoch": 1.96, "grad_norm": 2.3546242713928223, "learning_rate": 1e-05, "loss": 1.046, "step": 11430 }, { "epoch": 1.96, "grad_norm": 2.6119978427886963, "learning_rate": 1e-05, "loss": 1.1451, "step": 11440 }, { "epoch": 1.97, "grad_norm": 2.4649658203125, "learning_rate": 1e-05, "loss": 1.0584, "step": 11450 }, { "epoch": 1.97, "grad_norm": 2.748032331466675, "learning_rate": 1e-05, "loss": 1.1017, "step": 11460 }, { "epoch": 1.97, "grad_norm": 3.6822683811187744, "learning_rate": 1e-05, "loss": 0.9986, "step": 11470 }, { "epoch": 1.97, "grad_norm": 2.461031436920166, "learning_rate": 1e-05, "loss": 1.1179, "step": 11480 }, { "epoch": 1.97, "grad_norm": 1.9633231163024902, "learning_rate": 1e-05, "loss": 1.1283, "step": 11490 }, { "epoch": 1.97, "grad_norm": 2.6085851192474365, "learning_rate": 1e-05, "loss": 1.0043, "step": 11500 }, { "epoch": 1.98, "grad_norm": 2.4378316402435303, "learning_rate": 1e-05, "loss": 0.9722, "step": 11510 }, { "epoch": 1.98, "grad_norm": 2.0799951553344727, "learning_rate": 1e-05, "loss": 1.1276, "step": 11520 }, { "epoch": 1.98, "grad_norm": 2.2782275676727295, "learning_rate": 1e-05, "loss": 1.1108, "step": 11530 }, { "epoch": 1.98, "grad_norm": 2.5447194576263428, "learning_rate": 1e-05, "loss": 1.1114, "step": 11540 }, { "epoch": 1.98, "grad_norm": 2.684854745864868, "learning_rate": 1e-05, "loss": 1.0431, "step": 11550 }, { "epoch": 1.99, "grad_norm": 2.318377733230591, "learning_rate": 1e-05, "loss": 1.0562, "step": 11560 }, { "epoch": 1.99, "grad_norm": 2.3444907665252686, "learning_rate": 1e-05, "loss": 1.1112, "step": 11570 }, { "epoch": 1.99, "grad_norm": 2.450131416320801, "learning_rate": 1e-05, "loss": 1.0187, "step": 11580 }, { "epoch": 1.99, "grad_norm": 3.441574811935425, "learning_rate": 1e-05, "loss": 1.1204, "step": 11590 }, { "epoch": 1.99, "grad_norm": 3.4324586391448975, "learning_rate": 1e-05, "loss": 1.0423, "step": 11600 }, { "epoch": 1.99, "grad_norm": 2.931483030319214, "learning_rate": 1e-05, "loss": 1.112, "step": 11610 }, { "epoch": 2.0, "grad_norm": 3.359668731689453, "learning_rate": 1e-05, "loss": 1.0799, "step": 11620 }, { "epoch": 2.0, "grad_norm": 2.5941739082336426, "learning_rate": 1e-05, "loss": 0.997, "step": 11630 }, { "epoch": 2.0, "grad_norm": 2.8515117168426514, "learning_rate": 1e-05, "loss": 1.0763, "step": 11640 }, { "epoch": 2.0, "eval_loss": 1.849780797958374, "eval_runtime": 21.7737, "eval_samples_per_second": 45.927, "eval_steps_per_second": 45.927, "step": 11646 }, { "epoch": 2.0, "grad_norm": 2.843797206878662, "learning_rate": 1e-05, "loss": 0.9207, "step": 11650 }, { "epoch": 2.0, "grad_norm": 1.897534966468811, "learning_rate": 1e-05, "loss": 0.7565, "step": 11660 }, { "epoch": 2.0, "grad_norm": 4.776578903198242, "learning_rate": 1e-05, "loss": 0.8172, "step": 11670 }, { "epoch": 2.01, "grad_norm": 2.535592555999756, "learning_rate": 1e-05, "loss": 0.7153, "step": 11680 }, { "epoch": 2.01, "grad_norm": 3.9274134635925293, "learning_rate": 1e-05, "loss": 0.7028, "step": 11690 }, { "epoch": 2.01, "grad_norm": 2.555481195449829, "learning_rate": 1e-05, "loss": 0.7169, "step": 11700 }, { "epoch": 2.01, "grad_norm": 1.7720446586608887, "learning_rate": 1e-05, "loss": 0.6535, "step": 11710 }, { "epoch": 2.01, "grad_norm": 2.346792221069336, "learning_rate": 1e-05, "loss": 0.6748, "step": 11720 }, { "epoch": 2.01, "grad_norm": 3.803128480911255, "learning_rate": 1e-05, "loss": 0.7134, "step": 11730 }, { "epoch": 2.02, "grad_norm": 3.682583808898926, "learning_rate": 1e-05, "loss": 0.7639, "step": 11740 }, { "epoch": 2.02, "grad_norm": 2.075239658355713, "learning_rate": 1e-05, "loss": 0.6385, "step": 11750 }, { "epoch": 2.02, "grad_norm": 1.98849618434906, "learning_rate": 1e-05, "loss": 0.6723, "step": 11760 }, { "epoch": 2.02, "grad_norm": 2.6985552310943604, "learning_rate": 1e-05, "loss": 0.658, "step": 11770 }, { "epoch": 2.02, "grad_norm": 3.478668689727783, "learning_rate": 1e-05, "loss": 0.6914, "step": 11780 }, { "epoch": 2.02, "grad_norm": 2.5365147590637207, "learning_rate": 1e-05, "loss": 0.6808, "step": 11790 }, { "epoch": 2.03, "grad_norm": 3.0944130420684814, "learning_rate": 1e-05, "loss": 0.6943, "step": 11800 }, { "epoch": 2.03, "grad_norm": 2.60815691947937, "learning_rate": 1e-05, "loss": 0.6429, "step": 11810 }, { "epoch": 2.03, "grad_norm": 2.929291248321533, "learning_rate": 1e-05, "loss": 0.7343, "step": 11820 }, { "epoch": 2.03, "grad_norm": 3.015611171722412, "learning_rate": 1e-05, "loss": 0.6732, "step": 11830 }, { "epoch": 2.03, "grad_norm": 3.052377939224243, "learning_rate": 1e-05, "loss": 0.8246, "step": 11840 }, { "epoch": 2.04, "grad_norm": 2.1589531898498535, "learning_rate": 1e-05, "loss": 0.6938, "step": 11850 }, { "epoch": 2.04, "grad_norm": 2.822674036026001, "learning_rate": 1e-05, "loss": 0.6883, "step": 11860 }, { "epoch": 2.04, "grad_norm": 2.3565139770507812, "learning_rate": 1e-05, "loss": 0.6635, "step": 11870 }, { "epoch": 2.04, "grad_norm": 2.325673818588257, "learning_rate": 1e-05, "loss": 0.7169, "step": 11880 }, { "epoch": 2.04, "grad_norm": 2.929018020629883, "learning_rate": 1e-05, "loss": 0.7049, "step": 11890 }, { "epoch": 2.04, "grad_norm": 2.7780282497406006, "learning_rate": 1e-05, "loss": 0.6585, "step": 11900 }, { "epoch": 2.05, "grad_norm": 2.293572187423706, "learning_rate": 1e-05, "loss": 0.7346, "step": 11910 }, { "epoch": 2.05, "grad_norm": 2.2444581985473633, "learning_rate": 1e-05, "loss": 0.6725, "step": 11920 }, { "epoch": 2.05, "grad_norm": 2.315701723098755, "learning_rate": 1e-05, "loss": 0.6875, "step": 11930 }, { "epoch": 2.05, "grad_norm": 2.3992395401000977, "learning_rate": 1e-05, "loss": 0.7435, "step": 11940 }, { "epoch": 2.05, "grad_norm": 2.5108776092529297, "learning_rate": 1e-05, "loss": 0.6308, "step": 11950 }, { "epoch": 2.05, "grad_norm": 3.073500156402588, "learning_rate": 1e-05, "loss": 0.6695, "step": 11960 }, { "epoch": 2.06, "grad_norm": 2.31484055519104, "learning_rate": 1e-05, "loss": 0.7226, "step": 11970 }, { "epoch": 2.06, "grad_norm": 3.2437198162078857, "learning_rate": 1e-05, "loss": 0.6808, "step": 11980 }, { "epoch": 2.06, "grad_norm": 3.2090394496917725, "learning_rate": 1e-05, "loss": 0.7653, "step": 11990 }, { "epoch": 2.06, "grad_norm": 3.439284086227417, "learning_rate": 1e-05, "loss": 0.6668, "step": 12000 }, { "epoch": 2.06, "grad_norm": 3.4921059608459473, "learning_rate": 1e-05, "loss": 0.7467, "step": 12010 }, { "epoch": 2.06, "grad_norm": 2.9912140369415283, "learning_rate": 1e-05, "loss": 0.723, "step": 12020 }, { "epoch": 2.07, "grad_norm": 2.1998276710510254, "learning_rate": 1e-05, "loss": 0.564, "step": 12030 }, { "epoch": 2.07, "grad_norm": 2.261547327041626, "learning_rate": 1e-05, "loss": 0.6412, "step": 12040 }, { "epoch": 2.07, "grad_norm": 2.4579389095306396, "learning_rate": 1e-05, "loss": 0.707, "step": 12050 }, { "epoch": 2.07, "grad_norm": 2.2751379013061523, "learning_rate": 1e-05, "loss": 0.6533, "step": 12060 }, { "epoch": 2.07, "grad_norm": 2.349090576171875, "learning_rate": 1e-05, "loss": 0.8271, "step": 12070 }, { "epoch": 2.07, "grad_norm": 2.216106653213501, "learning_rate": 1e-05, "loss": 0.669, "step": 12080 }, { "epoch": 2.08, "grad_norm": 2.4988324642181396, "learning_rate": 1e-05, "loss": 0.7576, "step": 12090 }, { "epoch": 2.08, "grad_norm": 2.8161988258361816, "learning_rate": 1e-05, "loss": 0.7229, "step": 12100 }, { "epoch": 2.08, "grad_norm": 2.5631985664367676, "learning_rate": 1e-05, "loss": 0.7278, "step": 12110 }, { "epoch": 2.08, "grad_norm": 2.14375376701355, "learning_rate": 1e-05, "loss": 0.665, "step": 12120 }, { "epoch": 2.08, "grad_norm": 2.518671989440918, "learning_rate": 1e-05, "loss": 0.6927, "step": 12130 }, { "epoch": 2.08, "grad_norm": 2.4540021419525146, "learning_rate": 1e-05, "loss": 0.7187, "step": 12140 }, { "epoch": 2.09, "grad_norm": 3.0300498008728027, "learning_rate": 1e-05, "loss": 0.6903, "step": 12150 }, { "epoch": 2.09, "grad_norm": 2.7865285873413086, "learning_rate": 1e-05, "loss": 0.6999, "step": 12160 }, { "epoch": 2.09, "grad_norm": 3.4259936809539795, "learning_rate": 1e-05, "loss": 0.7085, "step": 12170 }, { "epoch": 2.09, "grad_norm": 2.500450611114502, "learning_rate": 1e-05, "loss": 0.71, "step": 12180 }, { "epoch": 2.09, "grad_norm": 2.607896566390991, "learning_rate": 1e-05, "loss": 0.6563, "step": 12190 }, { "epoch": 2.1, "grad_norm": 3.381148338317871, "learning_rate": 1e-05, "loss": 0.7063, "step": 12200 }, { "epoch": 2.1, "grad_norm": 1.8460057973861694, "learning_rate": 1e-05, "loss": 0.7151, "step": 12210 }, { "epoch": 2.1, "grad_norm": 2.369692802429199, "learning_rate": 1e-05, "loss": 0.7453, "step": 12220 }, { "epoch": 2.1, "grad_norm": 2.246582269668579, "learning_rate": 1e-05, "loss": 0.6495, "step": 12230 }, { "epoch": 2.1, "grad_norm": 2.5934314727783203, "learning_rate": 1e-05, "loss": 0.6994, "step": 12240 }, { "epoch": 2.1, "grad_norm": 2.804248094558716, "learning_rate": 1e-05, "loss": 0.7329, "step": 12250 }, { "epoch": 2.11, "grad_norm": 2.5733225345611572, "learning_rate": 1e-05, "loss": 0.6917, "step": 12260 }, { "epoch": 2.11, "grad_norm": 2.266530990600586, "learning_rate": 1e-05, "loss": 0.6712, "step": 12270 }, { "epoch": 2.11, "grad_norm": 2.1767702102661133, "learning_rate": 1e-05, "loss": 0.663, "step": 12280 }, { "epoch": 2.11, "grad_norm": 2.936537027359009, "learning_rate": 1e-05, "loss": 0.6707, "step": 12290 }, { "epoch": 2.11, "grad_norm": 2.5065464973449707, "learning_rate": 1e-05, "loss": 0.7478, "step": 12300 }, { "epoch": 2.11, "grad_norm": 2.3270397186279297, "learning_rate": 1e-05, "loss": 0.7466, "step": 12310 }, { "epoch": 2.12, "grad_norm": 2.324957847595215, "learning_rate": 1e-05, "loss": 0.7159, "step": 12320 }, { "epoch": 2.12, "grad_norm": 2.8157315254211426, "learning_rate": 1e-05, "loss": 0.6462, "step": 12330 }, { "epoch": 2.12, "grad_norm": 3.481329917907715, "learning_rate": 1e-05, "loss": 0.6302, "step": 12340 }, { "epoch": 2.12, "grad_norm": 2.075322389602661, "learning_rate": 1e-05, "loss": 0.6474, "step": 12350 }, { "epoch": 2.12, "grad_norm": 2.462676763534546, "learning_rate": 1e-05, "loss": 0.6556, "step": 12360 }, { "epoch": 2.12, "grad_norm": 2.9119620323181152, "learning_rate": 1e-05, "loss": 0.7142, "step": 12370 }, { "epoch": 2.13, "grad_norm": 2.999208450317383, "learning_rate": 1e-05, "loss": 0.6602, "step": 12380 }, { "epoch": 2.13, "grad_norm": 2.5624887943267822, "learning_rate": 1e-05, "loss": 0.7031, "step": 12390 }, { "epoch": 2.13, "grad_norm": 2.6522111892700195, "learning_rate": 1e-05, "loss": 0.7189, "step": 12400 }, { "epoch": 2.13, "grad_norm": 2.1354119777679443, "learning_rate": 1e-05, "loss": 0.7259, "step": 12410 }, { "epoch": 2.13, "grad_norm": 2.693638324737549, "learning_rate": 1e-05, "loss": 0.6861, "step": 12420 }, { "epoch": 2.13, "grad_norm": 2.5909125804901123, "learning_rate": 1e-05, "loss": 0.6882, "step": 12430 }, { "epoch": 2.14, "grad_norm": 2.9442074298858643, "learning_rate": 1e-05, "loss": 0.7714, "step": 12440 }, { "epoch": 2.14, "grad_norm": 2.713937282562256, "learning_rate": 1e-05, "loss": 0.6279, "step": 12450 }, { "epoch": 2.14, "grad_norm": 2.535755157470703, "learning_rate": 1e-05, "loss": 0.6467, "step": 12460 }, { "epoch": 2.14, "grad_norm": 2.3990252017974854, "learning_rate": 1e-05, "loss": 0.6979, "step": 12470 }, { "epoch": 2.14, "grad_norm": 1.9193114042282104, "learning_rate": 1e-05, "loss": 0.6374, "step": 12480 }, { "epoch": 2.14, "grad_norm": 2.7493979930877686, "learning_rate": 1e-05, "loss": 0.652, "step": 12490 }, { "epoch": 2.15, "grad_norm": 2.063673734664917, "learning_rate": 1e-05, "loss": 0.7104, "step": 12500 }, { "epoch": 2.15, "grad_norm": 2.684713840484619, "learning_rate": 1e-05, "loss": 0.6893, "step": 12510 }, { "epoch": 2.15, "grad_norm": 2.4041292667388916, "learning_rate": 1e-05, "loss": 0.6721, "step": 12520 }, { "epoch": 2.15, "grad_norm": 1.9186309576034546, "learning_rate": 1e-05, "loss": 0.666, "step": 12530 }, { "epoch": 2.15, "grad_norm": 3.9311676025390625, "learning_rate": 1e-05, "loss": 0.7856, "step": 12540 }, { "epoch": 2.16, "grad_norm": 2.0597290992736816, "learning_rate": 1e-05, "loss": 0.7454, "step": 12550 }, { "epoch": 2.16, "grad_norm": 3.2816972732543945, "learning_rate": 1e-05, "loss": 0.7107, "step": 12560 }, { "epoch": 2.16, "grad_norm": 2.5250680446624756, "learning_rate": 1e-05, "loss": 0.6695, "step": 12570 }, { "epoch": 2.16, "grad_norm": 2.6315548419952393, "learning_rate": 1e-05, "loss": 0.6763, "step": 12580 }, { "epoch": 2.16, "grad_norm": 2.9592995643615723, "learning_rate": 1e-05, "loss": 0.711, "step": 12590 }, { "epoch": 2.16, "grad_norm": 2.862454891204834, "learning_rate": 1e-05, "loss": 0.7075, "step": 12600 }, { "epoch": 2.17, "grad_norm": 2.8937571048736572, "learning_rate": 1e-05, "loss": 0.712, "step": 12610 }, { "epoch": 2.17, "grad_norm": 2.583103656768799, "learning_rate": 1e-05, "loss": 0.6897, "step": 12620 }, { "epoch": 2.17, "grad_norm": 3.4071152210235596, "learning_rate": 1e-05, "loss": 0.6719, "step": 12630 }, { "epoch": 2.17, "grad_norm": 2.4355502128601074, "learning_rate": 1e-05, "loss": 0.7007, "step": 12640 }, { "epoch": 2.17, "grad_norm": 2.881779670715332, "learning_rate": 1e-05, "loss": 0.7032, "step": 12650 }, { "epoch": 2.17, "grad_norm": 2.5507924556732178, "learning_rate": 1e-05, "loss": 0.731, "step": 12660 }, { "epoch": 2.18, "grad_norm": 2.8062305450439453, "learning_rate": 1e-05, "loss": 0.7516, "step": 12670 }, { "epoch": 2.18, "grad_norm": 2.2241053581237793, "learning_rate": 1e-05, "loss": 0.6573, "step": 12680 }, { "epoch": 2.18, "grad_norm": 1.9851512908935547, "learning_rate": 1e-05, "loss": 0.6876, "step": 12690 }, { "epoch": 2.18, "grad_norm": 2.9645676612854004, "learning_rate": 1e-05, "loss": 0.7315, "step": 12700 }, { "epoch": 2.18, "grad_norm": 2.893385887145996, "learning_rate": 1e-05, "loss": 0.7508, "step": 12710 }, { "epoch": 2.18, "grad_norm": 2.194856643676758, "learning_rate": 1e-05, "loss": 0.7279, "step": 12720 }, { "epoch": 2.19, "grad_norm": 3.4220829010009766, "learning_rate": 1e-05, "loss": 0.6461, "step": 12730 }, { "epoch": 2.19, "grad_norm": 3.7091429233551025, "learning_rate": 1e-05, "loss": 0.6547, "step": 12740 }, { "epoch": 2.19, "grad_norm": 3.0404088497161865, "learning_rate": 1e-05, "loss": 0.6513, "step": 12750 }, { "epoch": 2.19, "grad_norm": 2.5405914783477783, "learning_rate": 1e-05, "loss": 0.7101, "step": 12760 }, { "epoch": 2.19, "grad_norm": 3.01507306098938, "learning_rate": 1e-05, "loss": 0.6865, "step": 12770 }, { "epoch": 2.19, "grad_norm": 3.3664188385009766, "learning_rate": 1e-05, "loss": 0.6304, "step": 12780 }, { "epoch": 2.2, "grad_norm": 2.8610332012176514, "learning_rate": 1e-05, "loss": 0.6897, "step": 12790 }, { "epoch": 2.2, "grad_norm": 3.2472474575042725, "learning_rate": 1e-05, "loss": 0.6572, "step": 12800 }, { "epoch": 2.2, "grad_norm": 3.2986018657684326, "learning_rate": 1e-05, "loss": 0.6768, "step": 12810 }, { "epoch": 2.2, "grad_norm": 2.359896421432495, "learning_rate": 1e-05, "loss": 0.7468, "step": 12820 }, { "epoch": 2.2, "grad_norm": 2.9575815200805664, "learning_rate": 1e-05, "loss": 0.6759, "step": 12830 }, { "epoch": 2.21, "grad_norm": 1.660651445388794, "learning_rate": 1e-05, "loss": 0.6545, "step": 12840 }, { "epoch": 2.21, "grad_norm": 2.5651981830596924, "learning_rate": 1e-05, "loss": 0.6934, "step": 12850 }, { "epoch": 2.21, "grad_norm": 2.8365325927734375, "learning_rate": 1e-05, "loss": 0.684, "step": 12860 }, { "epoch": 2.21, "grad_norm": 2.5450923442840576, "learning_rate": 1e-05, "loss": 0.7585, "step": 12870 }, { "epoch": 2.21, "grad_norm": 3.447737455368042, "learning_rate": 1e-05, "loss": 0.7427, "step": 12880 }, { "epoch": 2.21, "grad_norm": 3.349693536758423, "learning_rate": 1e-05, "loss": 0.6351, "step": 12890 }, { "epoch": 2.22, "grad_norm": 2.5604896545410156, "learning_rate": 1e-05, "loss": 0.7356, "step": 12900 }, { "epoch": 2.22, "grad_norm": 3.0410192012786865, "learning_rate": 1e-05, "loss": 0.7518, "step": 12910 }, { "epoch": 2.22, "grad_norm": 2.895411252975464, "learning_rate": 1e-05, "loss": 0.7153, "step": 12920 }, { "epoch": 2.22, "grad_norm": 2.255626678466797, "learning_rate": 1e-05, "loss": 0.7202, "step": 12930 }, { "epoch": 2.22, "grad_norm": 2.550957441329956, "learning_rate": 1e-05, "loss": 0.7901, "step": 12940 }, { "epoch": 2.22, "grad_norm": 2.8669633865356445, "learning_rate": 1e-05, "loss": 0.6964, "step": 12950 }, { "epoch": 2.23, "grad_norm": 2.965333938598633, "learning_rate": 1e-05, "loss": 0.7156, "step": 12960 }, { "epoch": 2.23, "grad_norm": 2.201613664627075, "learning_rate": 1e-05, "loss": 0.6929, "step": 12970 }, { "epoch": 2.23, "grad_norm": 2.1694796085357666, "learning_rate": 1e-05, "loss": 0.6411, "step": 12980 }, { "epoch": 2.23, "grad_norm": 3.265608549118042, "learning_rate": 1e-05, "loss": 0.7531, "step": 12990 }, { "epoch": 2.23, "grad_norm": 3.275178909301758, "learning_rate": 1e-05, "loss": 0.807, "step": 13000 }, { "epoch": 2.23, "grad_norm": 2.581843137741089, "learning_rate": 1e-05, "loss": 0.7001, "step": 13010 }, { "epoch": 2.24, "grad_norm": 2.3566198348999023, "learning_rate": 1e-05, "loss": 0.7484, "step": 13020 }, { "epoch": 2.24, "grad_norm": 2.3498802185058594, "learning_rate": 1e-05, "loss": 0.6727, "step": 13030 }, { "epoch": 2.24, "grad_norm": 2.0292038917541504, "learning_rate": 1e-05, "loss": 0.7085, "step": 13040 }, { "epoch": 2.24, "grad_norm": 3.543577194213867, "learning_rate": 1e-05, "loss": 0.6679, "step": 13050 }, { "epoch": 2.24, "grad_norm": 2.6873228549957275, "learning_rate": 1e-05, "loss": 0.7377, "step": 13060 }, { "epoch": 2.24, "grad_norm": 3.999117851257324, "learning_rate": 1e-05, "loss": 0.8451, "step": 13070 }, { "epoch": 2.25, "grad_norm": 2.971100091934204, "learning_rate": 1e-05, "loss": 0.738, "step": 13080 }, { "epoch": 2.25, "grad_norm": 2.838461399078369, "learning_rate": 1e-05, "loss": 0.7077, "step": 13090 }, { "epoch": 2.25, "grad_norm": 2.252855062484741, "learning_rate": 1e-05, "loss": 0.7294, "step": 13100 }, { "epoch": 2.25, "grad_norm": 3.0866432189941406, "learning_rate": 1e-05, "loss": 0.7138, "step": 13110 }, { "epoch": 2.25, "grad_norm": 2.364053726196289, "learning_rate": 1e-05, "loss": 0.6806, "step": 13120 }, { "epoch": 2.25, "grad_norm": 2.768568515777588, "learning_rate": 1e-05, "loss": 0.7196, "step": 13130 }, { "epoch": 2.26, "grad_norm": 4.763648986816406, "learning_rate": 1e-05, "loss": 0.7165, "step": 13140 }, { "epoch": 2.26, "grad_norm": 3.4651520252227783, "learning_rate": 1e-05, "loss": 0.7155, "step": 13150 }, { "epoch": 2.26, "grad_norm": 2.2361979484558105, "learning_rate": 1e-05, "loss": 0.7171, "step": 13160 }, { "epoch": 2.26, "grad_norm": 2.607278347015381, "learning_rate": 1e-05, "loss": 0.698, "step": 13170 }, { "epoch": 2.26, "grad_norm": 2.6196305751800537, "learning_rate": 1e-05, "loss": 0.7162, "step": 13180 }, { "epoch": 2.27, "grad_norm": 2.6148579120635986, "learning_rate": 1e-05, "loss": 0.6812, "step": 13190 }, { "epoch": 2.27, "grad_norm": 2.5217950344085693, "learning_rate": 1e-05, "loss": 0.7015, "step": 13200 }, { "epoch": 2.27, "grad_norm": 3.2551751136779785, "learning_rate": 1e-05, "loss": 0.7336, "step": 13210 }, { "epoch": 2.27, "grad_norm": 2.751849412918091, "learning_rate": 1e-05, "loss": 0.7156, "step": 13220 }, { "epoch": 2.27, "grad_norm": 2.5473365783691406, "learning_rate": 1e-05, "loss": 0.7103, "step": 13230 }, { "epoch": 2.27, "grad_norm": 2.067836046218872, "learning_rate": 1e-05, "loss": 0.7334, "step": 13240 }, { "epoch": 2.28, "grad_norm": 2.5932745933532715, "learning_rate": 1e-05, "loss": 0.658, "step": 13250 }, { "epoch": 2.28, "grad_norm": 3.098078489303589, "learning_rate": 1e-05, "loss": 0.6701, "step": 13260 }, { "epoch": 2.28, "grad_norm": 2.336190700531006, "learning_rate": 1e-05, "loss": 0.6707, "step": 13270 }, { "epoch": 2.28, "grad_norm": 2.382223606109619, "learning_rate": 1e-05, "loss": 0.6473, "step": 13280 }, { "epoch": 2.28, "grad_norm": 2.551252603530884, "learning_rate": 1e-05, "loss": 0.8185, "step": 13290 }, { "epoch": 2.28, "grad_norm": 2.903029203414917, "learning_rate": 1e-05, "loss": 0.7188, "step": 13300 }, { "epoch": 2.29, "grad_norm": 3.2348668575286865, "learning_rate": 1e-05, "loss": 0.7409, "step": 13310 }, { "epoch": 2.29, "grad_norm": 2.7562198638916016, "learning_rate": 1e-05, "loss": 0.7271, "step": 13320 }, { "epoch": 2.29, "grad_norm": 3.178734540939331, "learning_rate": 1e-05, "loss": 0.7138, "step": 13330 }, { "epoch": 2.29, "grad_norm": 2.834533929824829, "learning_rate": 1e-05, "loss": 0.6777, "step": 13340 }, { "epoch": 2.29, "grad_norm": 2.3648908138275146, "learning_rate": 1e-05, "loss": 0.7587, "step": 13350 }, { "epoch": 2.29, "grad_norm": 3.3052682876586914, "learning_rate": 1e-05, "loss": 0.7108, "step": 13360 }, { "epoch": 2.3, "grad_norm": 2.5473461151123047, "learning_rate": 1e-05, "loss": 0.6858, "step": 13370 }, { "epoch": 2.3, "grad_norm": 2.689939260482788, "learning_rate": 1e-05, "loss": 0.7253, "step": 13380 }, { "epoch": 2.3, "grad_norm": 2.7012176513671875, "learning_rate": 1e-05, "loss": 0.659, "step": 13390 }, { "epoch": 2.3, "grad_norm": 3.217085361480713, "learning_rate": 1e-05, "loss": 0.7182, "step": 13400 }, { "epoch": 2.3, "grad_norm": 2.019303321838379, "learning_rate": 1e-05, "loss": 0.715, "step": 13410 }, { "epoch": 2.3, "grad_norm": 2.136199474334717, "learning_rate": 1e-05, "loss": 0.8071, "step": 13420 }, { "epoch": 2.31, "grad_norm": 2.7258598804473877, "learning_rate": 1e-05, "loss": 0.7511, "step": 13430 }, { "epoch": 2.31, "grad_norm": 2.4771461486816406, "learning_rate": 1e-05, "loss": 0.6709, "step": 13440 }, { "epoch": 2.31, "grad_norm": 2.250577449798584, "learning_rate": 1e-05, "loss": 0.7144, "step": 13450 }, { "epoch": 2.31, "grad_norm": 2.512507677078247, "learning_rate": 1e-05, "loss": 0.643, "step": 13460 }, { "epoch": 2.31, "grad_norm": 2.3582048416137695, "learning_rate": 1e-05, "loss": 0.7727, "step": 13470 }, { "epoch": 2.31, "grad_norm": 2.9323537349700928, "learning_rate": 1e-05, "loss": 0.7564, "step": 13480 }, { "epoch": 2.32, "grad_norm": 4.975164413452148, "learning_rate": 1e-05, "loss": 0.7155, "step": 13490 }, { "epoch": 2.32, "grad_norm": 3.4974381923675537, "learning_rate": 1e-05, "loss": 0.6929, "step": 13500 }, { "epoch": 2.32, "grad_norm": 4.331045150756836, "learning_rate": 1e-05, "loss": 0.7187, "step": 13510 }, { "epoch": 2.32, "grad_norm": 2.769953489303589, "learning_rate": 1e-05, "loss": 0.7293, "step": 13520 }, { "epoch": 2.32, "grad_norm": 2.571716785430908, "learning_rate": 1e-05, "loss": 0.6686, "step": 13530 }, { "epoch": 2.33, "grad_norm": 2.4005305767059326, "learning_rate": 1e-05, "loss": 0.6675, "step": 13540 }, { "epoch": 2.33, "grad_norm": 4.396423816680908, "learning_rate": 1e-05, "loss": 0.7801, "step": 13550 }, { "epoch": 2.33, "grad_norm": 2.680354356765747, "learning_rate": 1e-05, "loss": 0.6873, "step": 13560 }, { "epoch": 2.33, "grad_norm": 2.4758498668670654, "learning_rate": 1e-05, "loss": 0.7204, "step": 13570 }, { "epoch": 2.33, "grad_norm": 2.857762575149536, "learning_rate": 1e-05, "loss": 0.7049, "step": 13580 }, { "epoch": 2.33, "grad_norm": 2.436479330062866, "learning_rate": 1e-05, "loss": 0.6877, "step": 13590 }, { "epoch": 2.34, "grad_norm": 3.100313186645508, "learning_rate": 1e-05, "loss": 0.7481, "step": 13600 }, { "epoch": 2.34, "grad_norm": 2.6817588806152344, "learning_rate": 1e-05, "loss": 0.7384, "step": 13610 }, { "epoch": 2.34, "grad_norm": 2.7088801860809326, "learning_rate": 1e-05, "loss": 0.8041, "step": 13620 }, { "epoch": 2.34, "grad_norm": 3.363356351852417, "learning_rate": 1e-05, "loss": 0.8368, "step": 13630 }, { "epoch": 2.34, "grad_norm": 3.5108892917633057, "learning_rate": 1e-05, "loss": 0.699, "step": 13640 }, { "epoch": 2.34, "grad_norm": 2.41882061958313, "learning_rate": 1e-05, "loss": 0.7362, "step": 13650 }, { "epoch": 2.35, "grad_norm": 2.439345359802246, "learning_rate": 1e-05, "loss": 0.6982, "step": 13660 }, { "epoch": 2.35, "grad_norm": 3.261932134628296, "learning_rate": 1e-05, "loss": 0.7125, "step": 13670 }, { "epoch": 2.35, "grad_norm": 1.8121339082717896, "learning_rate": 1e-05, "loss": 0.644, "step": 13680 }, { "epoch": 2.35, "grad_norm": 3.049854040145874, "learning_rate": 1e-05, "loss": 0.7739, "step": 13690 }, { "epoch": 2.35, "grad_norm": 2.311997413635254, "learning_rate": 1e-05, "loss": 0.7085, "step": 13700 }, { "epoch": 2.35, "grad_norm": 4.8963189125061035, "learning_rate": 1e-05, "loss": 0.7852, "step": 13710 }, { "epoch": 2.36, "grad_norm": 2.440222978591919, "learning_rate": 1e-05, "loss": 0.8285, "step": 13720 }, { "epoch": 2.36, "grad_norm": 3.0829832553863525, "learning_rate": 1e-05, "loss": 0.7115, "step": 13730 }, { "epoch": 2.36, "grad_norm": 2.4086291790008545, "learning_rate": 1e-05, "loss": 0.6505, "step": 13740 }, { "epoch": 2.36, "grad_norm": 6.449065685272217, "learning_rate": 1e-05, "loss": 0.7672, "step": 13750 }, { "epoch": 2.36, "grad_norm": 2.3513927459716797, "learning_rate": 1e-05, "loss": 0.7172, "step": 13760 }, { "epoch": 2.36, "grad_norm": 2.6725003719329834, "learning_rate": 1e-05, "loss": 0.6698, "step": 13770 }, { "epoch": 2.37, "grad_norm": 2.1811881065368652, "learning_rate": 1e-05, "loss": 0.6756, "step": 13780 }, { "epoch": 2.37, "grad_norm": 2.4521212577819824, "learning_rate": 1e-05, "loss": 0.6899, "step": 13790 }, { "epoch": 2.37, "grad_norm": 2.6575536727905273, "learning_rate": 1e-05, "loss": 0.6729, "step": 13800 }, { "epoch": 2.37, "grad_norm": 3.644134283065796, "learning_rate": 1e-05, "loss": 0.7842, "step": 13810 }, { "epoch": 2.37, "grad_norm": 2.9881744384765625, "learning_rate": 1e-05, "loss": 0.7806, "step": 13820 }, { "epoch": 2.38, "grad_norm": 2.4042603969573975, "learning_rate": 1e-05, "loss": 0.7118, "step": 13830 }, { "epoch": 2.38, "grad_norm": 2.671569347381592, "learning_rate": 1e-05, "loss": 0.6719, "step": 13840 }, { "epoch": 2.38, "grad_norm": 2.206813097000122, "learning_rate": 1e-05, "loss": 0.7705, "step": 13850 }, { "epoch": 2.38, "grad_norm": 2.489980459213257, "learning_rate": 1e-05, "loss": 0.7533, "step": 13860 }, { "epoch": 2.38, "grad_norm": 2.6754541397094727, "learning_rate": 1e-05, "loss": 0.707, "step": 13870 }, { "epoch": 2.38, "grad_norm": 2.5215911865234375, "learning_rate": 1e-05, "loss": 0.7196, "step": 13880 }, { "epoch": 2.39, "grad_norm": 2.8557751178741455, "learning_rate": 1e-05, "loss": 0.7224, "step": 13890 }, { "epoch": 2.39, "grad_norm": 2.152554750442505, "learning_rate": 1e-05, "loss": 0.7437, "step": 13900 }, { "epoch": 2.39, "grad_norm": 1.7907458543777466, "learning_rate": 1e-05, "loss": 0.6867, "step": 13910 }, { "epoch": 2.39, "grad_norm": 2.468668222427368, "learning_rate": 1e-05, "loss": 0.702, "step": 13920 }, { "epoch": 2.39, "grad_norm": 2.148282766342163, "learning_rate": 1e-05, "loss": 0.6518, "step": 13930 }, { "epoch": 2.39, "grad_norm": 3.8288798332214355, "learning_rate": 1e-05, "loss": 0.7376, "step": 13940 }, { "epoch": 2.4, "grad_norm": 2.21659779548645, "learning_rate": 1e-05, "loss": 0.6753, "step": 13950 }, { "epoch": 2.4, "grad_norm": 2.80706787109375, "learning_rate": 1e-05, "loss": 0.7521, "step": 13960 }, { "epoch": 2.4, "grad_norm": 3.0065953731536865, "learning_rate": 1e-05, "loss": 0.6689, "step": 13970 }, { "epoch": 2.4, "grad_norm": 2.646472215652466, "learning_rate": 1e-05, "loss": 0.7145, "step": 13980 }, { "epoch": 2.4, "grad_norm": 2.4246387481689453, "learning_rate": 1e-05, "loss": 0.7228, "step": 13990 }, { "epoch": 2.4, "grad_norm": 2.947126865386963, "learning_rate": 1e-05, "loss": 0.6762, "step": 14000 }, { "epoch": 2.41, "grad_norm": 2.71744704246521, "learning_rate": 1e-05, "loss": 0.7209, "step": 14010 }, { "epoch": 2.41, "grad_norm": 2.5872905254364014, "learning_rate": 1e-05, "loss": 0.6415, "step": 14020 }, { "epoch": 2.41, "grad_norm": 2.5897634029388428, "learning_rate": 1e-05, "loss": 0.7609, "step": 14030 }, { "epoch": 2.41, "grad_norm": 2.4245212078094482, "learning_rate": 1e-05, "loss": 0.6804, "step": 14040 }, { "epoch": 2.41, "grad_norm": 4.575323581695557, "learning_rate": 1e-05, "loss": 0.7517, "step": 14050 }, { "epoch": 2.41, "grad_norm": 2.2898592948913574, "learning_rate": 1e-05, "loss": 0.6849, "step": 14060 }, { "epoch": 2.42, "grad_norm": 3.359697103500366, "learning_rate": 1e-05, "loss": 0.7076, "step": 14070 }, { "epoch": 2.42, "grad_norm": 2.9763455390930176, "learning_rate": 1e-05, "loss": 0.747, "step": 14080 }, { "epoch": 2.42, "grad_norm": 2.798625946044922, "learning_rate": 1e-05, "loss": 0.6958, "step": 14090 }, { "epoch": 2.42, "grad_norm": 2.5556600093841553, "learning_rate": 1e-05, "loss": 0.7089, "step": 14100 }, { "epoch": 2.42, "grad_norm": 2.997627019882202, "learning_rate": 1e-05, "loss": 0.6879, "step": 14110 }, { "epoch": 2.42, "grad_norm": 3.4501309394836426, "learning_rate": 1e-05, "loss": 0.7083, "step": 14120 }, { "epoch": 2.43, "grad_norm": 3.426292657852173, "learning_rate": 1e-05, "loss": 0.7241, "step": 14130 }, { "epoch": 2.43, "grad_norm": 3.4111952781677246, "learning_rate": 1e-05, "loss": 0.7016, "step": 14140 }, { "epoch": 2.43, "grad_norm": 2.8646419048309326, "learning_rate": 1e-05, "loss": 0.7362, "step": 14150 }, { "epoch": 2.43, "grad_norm": 3.150669813156128, "learning_rate": 1e-05, "loss": 0.705, "step": 14160 }, { "epoch": 2.43, "grad_norm": 3.1819069385528564, "learning_rate": 1e-05, "loss": 0.7009, "step": 14170 }, { "epoch": 2.44, "grad_norm": 3.3720028400421143, "learning_rate": 1e-05, "loss": 0.6942, "step": 14180 }, { "epoch": 2.44, "grad_norm": 3.2291455268859863, "learning_rate": 1e-05, "loss": 0.7138, "step": 14190 }, { "epoch": 2.44, "grad_norm": 4.61140775680542, "learning_rate": 1e-05, "loss": 0.8018, "step": 14200 }, { "epoch": 2.44, "grad_norm": 4.3528265953063965, "learning_rate": 1e-05, "loss": 0.7181, "step": 14210 }, { "epoch": 2.44, "grad_norm": 3.0563249588012695, "learning_rate": 1e-05, "loss": 0.7748, "step": 14220 }, { "epoch": 2.44, "grad_norm": 1.952367901802063, "learning_rate": 1e-05, "loss": 0.7157, "step": 14230 }, { "epoch": 2.45, "grad_norm": 2.618277072906494, "learning_rate": 1e-05, "loss": 0.6776, "step": 14240 }, { "epoch": 2.45, "grad_norm": 2.3816099166870117, "learning_rate": 1e-05, "loss": 0.7168, "step": 14250 }, { "epoch": 2.45, "grad_norm": 5.881453037261963, "learning_rate": 1e-05, "loss": 0.7459, "step": 14260 }, { "epoch": 2.45, "grad_norm": 3.0305838584899902, "learning_rate": 1e-05, "loss": 0.7478, "step": 14270 }, { "epoch": 2.45, "grad_norm": 2.958314895629883, "learning_rate": 1e-05, "loss": 0.7025, "step": 14280 }, { "epoch": 2.45, "grad_norm": 2.898390054702759, "learning_rate": 1e-05, "loss": 0.7767, "step": 14290 }, { "epoch": 2.46, "grad_norm": 3.295344829559326, "learning_rate": 1e-05, "loss": 0.7745, "step": 14300 }, { "epoch": 2.46, "grad_norm": 2.084602117538452, "learning_rate": 1e-05, "loss": 0.6736, "step": 14310 }, { "epoch": 2.46, "grad_norm": 3.2696902751922607, "learning_rate": 1e-05, "loss": 0.7092, "step": 14320 }, { "epoch": 2.46, "grad_norm": 2.228020429611206, "learning_rate": 1e-05, "loss": 0.6445, "step": 14330 }, { "epoch": 2.46, "grad_norm": 3.259493350982666, "learning_rate": 1e-05, "loss": 0.7543, "step": 14340 }, { "epoch": 2.46, "grad_norm": 4.3431501388549805, "learning_rate": 1e-05, "loss": 0.7228, "step": 14350 }, { "epoch": 2.47, "grad_norm": 2.6910886764526367, "learning_rate": 1e-05, "loss": 0.7423, "step": 14360 }, { "epoch": 2.47, "grad_norm": 3.4436614513397217, "learning_rate": 1e-05, "loss": 0.6928, "step": 14370 }, { "epoch": 2.47, "grad_norm": 3.588196039199829, "learning_rate": 1e-05, "loss": 0.7113, "step": 14380 }, { "epoch": 2.47, "grad_norm": 1.8467262983322144, "learning_rate": 1e-05, "loss": 0.6801, "step": 14390 }, { "epoch": 2.47, "grad_norm": 3.05918288230896, "learning_rate": 1e-05, "loss": 0.769, "step": 14400 }, { "epoch": 2.47, "grad_norm": 3.821401357650757, "learning_rate": 1e-05, "loss": 0.7344, "step": 14410 }, { "epoch": 2.48, "grad_norm": 3.6849071979522705, "learning_rate": 1e-05, "loss": 0.8138, "step": 14420 }, { "epoch": 2.48, "grad_norm": 2.413574457168579, "learning_rate": 1e-05, "loss": 0.7453, "step": 14430 }, { "epoch": 2.48, "grad_norm": 3.0552051067352295, "learning_rate": 1e-05, "loss": 0.665, "step": 14440 }, { "epoch": 2.48, "grad_norm": 2.2876293659210205, "learning_rate": 1e-05, "loss": 0.7064, "step": 14450 }, { "epoch": 2.48, "grad_norm": 2.75596022605896, "learning_rate": 1e-05, "loss": 0.7241, "step": 14460 }, { "epoch": 2.48, "grad_norm": 2.0840628147125244, "learning_rate": 1e-05, "loss": 0.7029, "step": 14470 }, { "epoch": 2.49, "grad_norm": 2.3339579105377197, "learning_rate": 1e-05, "loss": 0.6926, "step": 14480 }, { "epoch": 2.49, "grad_norm": 2.6257641315460205, "learning_rate": 1e-05, "loss": 0.7306, "step": 14490 }, { "epoch": 2.49, "grad_norm": 3.338038206100464, "learning_rate": 1e-05, "loss": 0.7289, "step": 14500 }, { "epoch": 2.49, "grad_norm": 3.947277545928955, "learning_rate": 1e-05, "loss": 0.7133, "step": 14510 }, { "epoch": 2.49, "grad_norm": 4.049989223480225, "learning_rate": 1e-05, "loss": 0.7539, "step": 14520 }, { "epoch": 2.5, "grad_norm": 2.6452383995056152, "learning_rate": 1e-05, "loss": 0.7556, "step": 14530 }, { "epoch": 2.5, "grad_norm": 2.4738383293151855, "learning_rate": 1e-05, "loss": 0.7545, "step": 14540 }, { "epoch": 2.5, "grad_norm": 3.1785967350006104, "learning_rate": 1e-05, "loss": 0.6619, "step": 14550 }, { "epoch": 2.5, "grad_norm": 2.7168779373168945, "learning_rate": 1e-05, "loss": 0.7748, "step": 14560 }, { "epoch": 2.5, "grad_norm": 2.5785582065582275, "learning_rate": 1e-05, "loss": 0.7754, "step": 14570 }, { "epoch": 2.5, "grad_norm": 3.7294836044311523, "learning_rate": 1e-05, "loss": 0.729, "step": 14580 }, { "epoch": 2.51, "grad_norm": 2.786391496658325, "learning_rate": 1e-05, "loss": 0.679, "step": 14590 }, { "epoch": 2.51, "grad_norm": 3.8487510681152344, "learning_rate": 1e-05, "loss": 0.7393, "step": 14600 }, { "epoch": 2.51, "grad_norm": 2.528351306915283, "learning_rate": 1e-05, "loss": 0.7025, "step": 14610 }, { "epoch": 2.51, "grad_norm": 2.2078957557678223, "learning_rate": 1e-05, "loss": 0.6564, "step": 14620 }, { "epoch": 2.51, "grad_norm": 2.7390918731689453, "learning_rate": 1e-05, "loss": 0.7214, "step": 14630 }, { "epoch": 2.51, "grad_norm": 2.95219087600708, "learning_rate": 1e-05, "loss": 0.7425, "step": 14640 }, { "epoch": 2.52, "grad_norm": 2.6406657695770264, "learning_rate": 1e-05, "loss": 0.7114, "step": 14650 }, { "epoch": 2.52, "grad_norm": 3.2420835494995117, "learning_rate": 1e-05, "loss": 0.7072, "step": 14660 }, { "epoch": 2.52, "grad_norm": 3.2121024131774902, "learning_rate": 1e-05, "loss": 0.7846, "step": 14670 }, { "epoch": 2.52, "grad_norm": 2.7226524353027344, "learning_rate": 1e-05, "loss": 0.7264, "step": 14680 }, { "epoch": 2.52, "grad_norm": 2.925915479660034, "learning_rate": 1e-05, "loss": 0.7575, "step": 14690 }, { "epoch": 2.52, "grad_norm": 2.7175862789154053, "learning_rate": 1e-05, "loss": 0.7436, "step": 14700 }, { "epoch": 2.53, "grad_norm": 2.9792709350585938, "learning_rate": 1e-05, "loss": 0.7475, "step": 14710 }, { "epoch": 2.53, "grad_norm": 2.419769048690796, "learning_rate": 1e-05, "loss": 0.757, "step": 14720 }, { "epoch": 2.53, "grad_norm": 2.0730252265930176, "learning_rate": 1e-05, "loss": 0.7351, "step": 14730 }, { "epoch": 2.53, "grad_norm": 3.0078718662261963, "learning_rate": 1e-05, "loss": 0.7393, "step": 14740 }, { "epoch": 2.53, "grad_norm": 1.7846797704696655, "learning_rate": 1e-05, "loss": 0.6745, "step": 14750 }, { "epoch": 2.53, "grad_norm": 2.7745330333709717, "learning_rate": 1e-05, "loss": 0.747, "step": 14760 }, { "epoch": 2.54, "grad_norm": 3.211197853088379, "learning_rate": 1e-05, "loss": 0.7089, "step": 14770 }, { "epoch": 2.54, "grad_norm": 2.642756223678589, "learning_rate": 1e-05, "loss": 0.7322, "step": 14780 }, { "epoch": 2.54, "grad_norm": 2.5132839679718018, "learning_rate": 1e-05, "loss": 0.6992, "step": 14790 }, { "epoch": 2.54, "grad_norm": 2.3744137287139893, "learning_rate": 1e-05, "loss": 0.6811, "step": 14800 }, { "epoch": 2.54, "grad_norm": 2.4576313495635986, "learning_rate": 1e-05, "loss": 0.7047, "step": 14810 }, { "epoch": 2.55, "grad_norm": 2.6812050342559814, "learning_rate": 1e-05, "loss": 0.753, "step": 14820 }, { "epoch": 2.55, "grad_norm": 2.089709997177124, "learning_rate": 1e-05, "loss": 0.7411, "step": 14830 }, { "epoch": 2.55, "grad_norm": 2.876819133758545, "learning_rate": 1e-05, "loss": 0.6582, "step": 14840 }, { "epoch": 2.55, "grad_norm": 2.931020736694336, "learning_rate": 1e-05, "loss": 0.7025, "step": 14850 }, { "epoch": 2.55, "grad_norm": 2.4845004081726074, "learning_rate": 1e-05, "loss": 0.7036, "step": 14860 }, { "epoch": 2.55, "grad_norm": 3.168095111846924, "learning_rate": 1e-05, "loss": 0.6892, "step": 14870 }, { "epoch": 2.56, "grad_norm": 2.145261526107788, "learning_rate": 1e-05, "loss": 0.662, "step": 14880 }, { "epoch": 2.56, "grad_norm": 2.742406129837036, "learning_rate": 1e-05, "loss": 0.7319, "step": 14890 }, { "epoch": 2.56, "grad_norm": 2.996185779571533, "learning_rate": 1e-05, "loss": 0.7447, "step": 14900 }, { "epoch": 2.56, "grad_norm": 3.1864700317382812, "learning_rate": 1e-05, "loss": 0.7869, "step": 14910 }, { "epoch": 2.56, "grad_norm": 2.646944522857666, "learning_rate": 1e-05, "loss": 0.7388, "step": 14920 }, { "epoch": 2.56, "grad_norm": 2.3198368549346924, "learning_rate": 1e-05, "loss": 0.7696, "step": 14930 }, { "epoch": 2.57, "grad_norm": 2.353590726852417, "learning_rate": 1e-05, "loss": 0.8069, "step": 14940 }, { "epoch": 2.57, "grad_norm": 2.4450297355651855, "learning_rate": 1e-05, "loss": 0.7355, "step": 14950 }, { "epoch": 2.57, "grad_norm": 2.8194401264190674, "learning_rate": 1e-05, "loss": 0.75, "step": 14960 }, { "epoch": 2.57, "grad_norm": 3.041092872619629, "learning_rate": 1e-05, "loss": 0.6854, "step": 14970 }, { "epoch": 2.57, "grad_norm": 4.193447589874268, "learning_rate": 1e-05, "loss": 0.7292, "step": 14980 }, { "epoch": 2.57, "grad_norm": 2.8166589736938477, "learning_rate": 1e-05, "loss": 0.7081, "step": 14990 }, { "epoch": 2.58, "grad_norm": 2.8678197860717773, "learning_rate": 1e-05, "loss": 0.7169, "step": 15000 }, { "epoch": 2.58, "grad_norm": 2.5955636501312256, "learning_rate": 1e-05, "loss": 0.7199, "step": 15010 }, { "epoch": 2.58, "grad_norm": 2.822446584701538, "learning_rate": 1e-05, "loss": 0.7213, "step": 15020 }, { "epoch": 2.58, "grad_norm": 3.7871756553649902, "learning_rate": 1e-05, "loss": 0.6311, "step": 15030 }, { "epoch": 2.58, "grad_norm": 4.210511207580566, "learning_rate": 1e-05, "loss": 0.738, "step": 15040 }, { "epoch": 2.58, "grad_norm": 2.3034372329711914, "learning_rate": 1e-05, "loss": 0.7456, "step": 15050 }, { "epoch": 2.59, "grad_norm": 2.2593023777008057, "learning_rate": 1e-05, "loss": 0.6465, "step": 15060 }, { "epoch": 2.59, "grad_norm": 3.289602756500244, "learning_rate": 1e-05, "loss": 0.7163, "step": 15070 }, { "epoch": 2.59, "grad_norm": 2.7729742527008057, "learning_rate": 1e-05, "loss": 0.7542, "step": 15080 }, { "epoch": 2.59, "grad_norm": 3.9700310230255127, "learning_rate": 1e-05, "loss": 0.6687, "step": 15090 }, { "epoch": 2.59, "grad_norm": 5.73222541809082, "learning_rate": 1e-05, "loss": 0.8529, "step": 15100 }, { "epoch": 2.59, "grad_norm": 2.1498641967773438, "learning_rate": 1e-05, "loss": 0.7654, "step": 15110 }, { "epoch": 2.6, "grad_norm": 2.562579870223999, "learning_rate": 1e-05, "loss": 0.7567, "step": 15120 }, { "epoch": 2.6, "grad_norm": 2.8067171573638916, "learning_rate": 1e-05, "loss": 0.8068, "step": 15130 }, { "epoch": 2.6, "grad_norm": 2.884761095046997, "learning_rate": 1e-05, "loss": 0.7657, "step": 15140 }, { "epoch": 2.6, "grad_norm": 3.6787796020507812, "learning_rate": 1e-05, "loss": 0.7246, "step": 15150 }, { "epoch": 2.6, "grad_norm": 4.4008331298828125, "learning_rate": 1e-05, "loss": 0.7994, "step": 15160 }, { "epoch": 2.61, "grad_norm": 2.7413175106048584, "learning_rate": 1e-05, "loss": 0.7031, "step": 15170 }, { "epoch": 2.61, "grad_norm": 2.5500998497009277, "learning_rate": 1e-05, "loss": 0.6695, "step": 15180 }, { "epoch": 2.61, "grad_norm": 2.7714643478393555, "learning_rate": 1e-05, "loss": 0.7173, "step": 15190 }, { "epoch": 2.61, "grad_norm": 2.525230884552002, "learning_rate": 1e-05, "loss": 0.7346, "step": 15200 }, { "epoch": 2.61, "grad_norm": 2.637814521789551, "learning_rate": 1e-05, "loss": 0.7767, "step": 15210 }, { "epoch": 2.61, "grad_norm": 2.278615713119507, "learning_rate": 1e-05, "loss": 0.7097, "step": 15220 }, { "epoch": 2.62, "grad_norm": 2.507465124130249, "learning_rate": 1e-05, "loss": 0.7192, "step": 15230 }, { "epoch": 2.62, "grad_norm": 3.573824405670166, "learning_rate": 1e-05, "loss": 0.7061, "step": 15240 }, { "epoch": 2.62, "grad_norm": 2.6793293952941895, "learning_rate": 1e-05, "loss": 0.7652, "step": 15250 }, { "epoch": 2.62, "grad_norm": 3.3592193126678467, "learning_rate": 1e-05, "loss": 0.738, "step": 15260 }, { "epoch": 2.62, "grad_norm": 2.9290943145751953, "learning_rate": 1e-05, "loss": 0.7563, "step": 15270 }, { "epoch": 2.62, "grad_norm": 2.167343854904175, "learning_rate": 1e-05, "loss": 0.6755, "step": 15280 }, { "epoch": 2.63, "grad_norm": 3.4597039222717285, "learning_rate": 1e-05, "loss": 0.7386, "step": 15290 }, { "epoch": 2.63, "grad_norm": 2.5685362815856934, "learning_rate": 1e-05, "loss": 0.7828, "step": 15300 }, { "epoch": 2.63, "grad_norm": 3.855380058288574, "learning_rate": 1e-05, "loss": 0.7743, "step": 15310 }, { "epoch": 2.63, "grad_norm": 2.524961233139038, "learning_rate": 1e-05, "loss": 0.7033, "step": 15320 }, { "epoch": 2.63, "grad_norm": 2.046072244644165, "learning_rate": 1e-05, "loss": 0.7532, "step": 15330 }, { "epoch": 2.63, "grad_norm": 4.325909614562988, "learning_rate": 1e-05, "loss": 0.7994, "step": 15340 }, { "epoch": 2.64, "grad_norm": 2.411571979522705, "learning_rate": 1e-05, "loss": 0.7081, "step": 15350 }, { "epoch": 2.64, "grad_norm": 3.4040896892547607, "learning_rate": 1e-05, "loss": 0.7443, "step": 15360 }, { "epoch": 2.64, "grad_norm": 2.6055779457092285, "learning_rate": 1e-05, "loss": 0.8008, "step": 15370 }, { "epoch": 2.64, "grad_norm": 2.8484854698181152, "learning_rate": 1e-05, "loss": 0.6909, "step": 15380 }, { "epoch": 2.64, "grad_norm": 2.2155449390411377, "learning_rate": 1e-05, "loss": 0.7675, "step": 15390 }, { "epoch": 2.64, "grad_norm": 2.5154049396514893, "learning_rate": 1e-05, "loss": 0.7156, "step": 15400 }, { "epoch": 2.65, "grad_norm": 3.7648818492889404, "learning_rate": 1e-05, "loss": 0.797, "step": 15410 }, { "epoch": 2.65, "grad_norm": 2.2900948524475098, "learning_rate": 1e-05, "loss": 0.7388, "step": 15420 }, { "epoch": 2.65, "grad_norm": 3.301736831665039, "learning_rate": 1e-05, "loss": 0.7711, "step": 15430 }, { "epoch": 2.65, "grad_norm": 11.01123332977295, "learning_rate": 1e-05, "loss": 0.8923, "step": 15440 }, { "epoch": 2.65, "grad_norm": 3.414224624633789, "learning_rate": 1e-05, "loss": 0.7313, "step": 15450 }, { "epoch": 2.65, "grad_norm": 2.749166965484619, "learning_rate": 1e-05, "loss": 0.7062, "step": 15460 }, { "epoch": 2.66, "grad_norm": 2.4409449100494385, "learning_rate": 1e-05, "loss": 0.7218, "step": 15470 }, { "epoch": 2.66, "grad_norm": 2.4345972537994385, "learning_rate": 1e-05, "loss": 0.7291, "step": 15480 }, { "epoch": 2.66, "grad_norm": 2.5892274379730225, "learning_rate": 1e-05, "loss": 0.7936, "step": 15490 }, { "epoch": 2.66, "grad_norm": 4.130225658416748, "learning_rate": 1e-05, "loss": 0.7268, "step": 15500 }, { "epoch": 2.66, "grad_norm": 3.403099298477173, "learning_rate": 1e-05, "loss": 0.7424, "step": 15510 }, { "epoch": 2.67, "grad_norm": 2.790379285812378, "learning_rate": 1e-05, "loss": 0.7942, "step": 15520 }, { "epoch": 2.67, "grad_norm": 3.102782964706421, "learning_rate": 1e-05, "loss": 0.7751, "step": 15530 }, { "epoch": 2.67, "grad_norm": 2.5872371196746826, "learning_rate": 1e-05, "loss": 0.7455, "step": 15540 }, { "epoch": 2.67, "grad_norm": 4.8439154624938965, "learning_rate": 1e-05, "loss": 0.7276, "step": 15550 }, { "epoch": 2.67, "grad_norm": 2.814612627029419, "learning_rate": 1e-05, "loss": 0.7593, "step": 15560 }, { "epoch": 2.67, "grad_norm": 3.2214343547821045, "learning_rate": 1e-05, "loss": 0.6791, "step": 15570 }, { "epoch": 2.68, "grad_norm": 4.046710014343262, "learning_rate": 1e-05, "loss": 0.7227, "step": 15580 }, { "epoch": 2.68, "grad_norm": 3.2067861557006836, "learning_rate": 1e-05, "loss": 0.7475, "step": 15590 }, { "epoch": 2.68, "grad_norm": 2.4715051651000977, "learning_rate": 1e-05, "loss": 0.7739, "step": 15600 }, { "epoch": 2.68, "grad_norm": 3.115117311477661, "learning_rate": 1e-05, "loss": 0.7745, "step": 15610 }, { "epoch": 2.68, "grad_norm": 3.757561206817627, "learning_rate": 1e-05, "loss": 0.7603, "step": 15620 }, { "epoch": 2.68, "grad_norm": 2.4830832481384277, "learning_rate": 1e-05, "loss": 0.7136, "step": 15630 }, { "epoch": 2.69, "grad_norm": 3.814326286315918, "learning_rate": 1e-05, "loss": 0.819, "step": 15640 }, { "epoch": 2.69, "grad_norm": 3.865314245223999, "learning_rate": 1e-05, "loss": 0.7187, "step": 15650 }, { "epoch": 2.69, "grad_norm": 4.542452335357666, "learning_rate": 1e-05, "loss": 0.6924, "step": 15660 }, { "epoch": 2.69, "grad_norm": 3.1028683185577393, "learning_rate": 1e-05, "loss": 0.7375, "step": 15670 }, { "epoch": 2.69, "grad_norm": 2.452650308609009, "learning_rate": 1e-05, "loss": 0.6479, "step": 15680 }, { "epoch": 2.69, "grad_norm": 2.847052574157715, "learning_rate": 1e-05, "loss": 0.7278, "step": 15690 }, { "epoch": 2.7, "grad_norm": 5.2814788818359375, "learning_rate": 1e-05, "loss": 0.8083, "step": 15700 }, { "epoch": 2.7, "grad_norm": 2.3837742805480957, "learning_rate": 1e-05, "loss": 0.8053, "step": 15710 }, { "epoch": 2.7, "grad_norm": 2.4592061042785645, "learning_rate": 1e-05, "loss": 0.6921, "step": 15720 }, { "epoch": 2.7, "grad_norm": 3.0937070846557617, "learning_rate": 1e-05, "loss": 0.7763, "step": 15730 }, { "epoch": 2.7, "grad_norm": 2.6138272285461426, "learning_rate": 1e-05, "loss": 0.7324, "step": 15740 }, { "epoch": 2.7, "grad_norm": 2.9379701614379883, "learning_rate": 1e-05, "loss": 0.7163, "step": 15750 }, { "epoch": 2.71, "grad_norm": 2.2379825115203857, "learning_rate": 1e-05, "loss": 0.6726, "step": 15760 }, { "epoch": 2.71, "grad_norm": 2.3431155681610107, "learning_rate": 1e-05, "loss": 0.7273, "step": 15770 }, { "epoch": 2.71, "grad_norm": 4.853468894958496, "learning_rate": 1e-05, "loss": 0.7136, "step": 15780 }, { "epoch": 2.71, "grad_norm": 3.053487777709961, "learning_rate": 1e-05, "loss": 0.7326, "step": 15790 }, { "epoch": 2.71, "grad_norm": 2.3642847537994385, "learning_rate": 1e-05, "loss": 0.748, "step": 15800 }, { "epoch": 2.72, "grad_norm": 2.356008768081665, "learning_rate": 1e-05, "loss": 0.7565, "step": 15810 }, { "epoch": 2.72, "grad_norm": 5.110803604125977, "learning_rate": 1e-05, "loss": 0.7384, "step": 15820 }, { "epoch": 2.72, "grad_norm": 2.1696460247039795, "learning_rate": 1e-05, "loss": 0.7006, "step": 15830 }, { "epoch": 2.72, "grad_norm": 2.4872984886169434, "learning_rate": 1e-05, "loss": 0.7122, "step": 15840 }, { "epoch": 2.72, "grad_norm": 2.6965861320495605, "learning_rate": 1e-05, "loss": 0.7435, "step": 15850 }, { "epoch": 2.72, "grad_norm": 2.5141866207122803, "learning_rate": 1e-05, "loss": 0.7116, "step": 15860 }, { "epoch": 2.73, "grad_norm": 3.048657178878784, "learning_rate": 1e-05, "loss": 0.7095, "step": 15870 }, { "epoch": 2.73, "grad_norm": 2.5788943767547607, "learning_rate": 1e-05, "loss": 0.7484, "step": 15880 }, { "epoch": 2.73, "grad_norm": 2.4205973148345947, "learning_rate": 1e-05, "loss": 0.7057, "step": 15890 }, { "epoch": 2.73, "grad_norm": 3.8227627277374268, "learning_rate": 1e-05, "loss": 0.7496, "step": 15900 }, { "epoch": 2.73, "grad_norm": 2.5252959728240967, "learning_rate": 1e-05, "loss": 0.7253, "step": 15910 }, { "epoch": 2.73, "grad_norm": 2.177140712738037, "learning_rate": 1e-05, "loss": 0.7887, "step": 15920 }, { "epoch": 2.74, "grad_norm": 2.435643434524536, "learning_rate": 1e-05, "loss": 0.816, "step": 15930 }, { "epoch": 2.74, "grad_norm": 2.5256426334381104, "learning_rate": 1e-05, "loss": 0.7462, "step": 15940 }, { "epoch": 2.74, "grad_norm": 2.2169747352600098, "learning_rate": 1e-05, "loss": 0.7447, "step": 15950 }, { "epoch": 2.74, "grad_norm": 2.4709417819976807, "learning_rate": 1e-05, "loss": 0.6927, "step": 15960 }, { "epoch": 2.74, "grad_norm": 3.4572293758392334, "learning_rate": 1e-05, "loss": 0.7019, "step": 15970 }, { "epoch": 2.74, "grad_norm": 2.54184627532959, "learning_rate": 1e-05, "loss": 0.7396, "step": 15980 }, { "epoch": 2.75, "grad_norm": 2.9560205936431885, "learning_rate": 1e-05, "loss": 0.7141, "step": 15990 }, { "epoch": 2.75, "grad_norm": 2.538736343383789, "learning_rate": 1e-05, "loss": 0.7024, "step": 16000 }, { "epoch": 2.75, "grad_norm": 2.4639668464660645, "learning_rate": 1e-05, "loss": 0.694, "step": 16010 }, { "epoch": 2.75, "grad_norm": 3.4678444862365723, "learning_rate": 1e-05, "loss": 0.7787, "step": 16020 }, { "epoch": 2.75, "grad_norm": 2.7737815380096436, "learning_rate": 1e-05, "loss": 0.7071, "step": 16030 }, { "epoch": 2.75, "grad_norm": 3.0524613857269287, "learning_rate": 1e-05, "loss": 0.7129, "step": 16040 }, { "epoch": 2.76, "grad_norm": 2.9612460136413574, "learning_rate": 1e-05, "loss": 0.7531, "step": 16050 }, { "epoch": 2.76, "grad_norm": 2.466224193572998, "learning_rate": 1e-05, "loss": 0.703, "step": 16060 }, { "epoch": 2.76, "grad_norm": 3.5040297508239746, "learning_rate": 1e-05, "loss": 0.7759, "step": 16070 }, { "epoch": 2.76, "grad_norm": 2.6110734939575195, "learning_rate": 1e-05, "loss": 0.7041, "step": 16080 }, { "epoch": 2.76, "grad_norm": 2.3298346996307373, "learning_rate": 1e-05, "loss": 0.7527, "step": 16090 }, { "epoch": 2.76, "grad_norm": 2.0623910427093506, "learning_rate": 1e-05, "loss": 0.7526, "step": 16100 }, { "epoch": 2.77, "grad_norm": 3.3194146156311035, "learning_rate": 1e-05, "loss": 0.6769, "step": 16110 }, { "epoch": 2.77, "grad_norm": 2.5238358974456787, "learning_rate": 1e-05, "loss": 0.7166, "step": 16120 }, { "epoch": 2.77, "grad_norm": 2.802138090133667, "learning_rate": 1e-05, "loss": 0.7491, "step": 16130 }, { "epoch": 2.77, "grad_norm": 3.0264785289764404, "learning_rate": 1e-05, "loss": 0.7306, "step": 16140 }, { "epoch": 2.77, "grad_norm": 5.061720848083496, "learning_rate": 1e-05, "loss": 0.7897, "step": 16150 }, { "epoch": 2.78, "grad_norm": 3.7770209312438965, "learning_rate": 1e-05, "loss": 0.6993, "step": 16160 }, { "epoch": 2.78, "grad_norm": 3.6209259033203125, "learning_rate": 1e-05, "loss": 0.8215, "step": 16170 }, { "epoch": 2.78, "grad_norm": 4.007264137268066, "learning_rate": 1e-05, "loss": 0.7738, "step": 16180 }, { "epoch": 2.78, "grad_norm": 3.5193967819213867, "learning_rate": 1e-05, "loss": 0.6896, "step": 16190 }, { "epoch": 2.78, "grad_norm": 2.8088245391845703, "learning_rate": 1e-05, "loss": 0.7401, "step": 16200 }, { "epoch": 2.78, "grad_norm": 2.5565385818481445, "learning_rate": 1e-05, "loss": 0.7178, "step": 16210 }, { "epoch": 2.79, "grad_norm": 3.078495502471924, "learning_rate": 1e-05, "loss": 0.782, "step": 16220 }, { "epoch": 2.79, "grad_norm": 2.354825973510742, "learning_rate": 1e-05, "loss": 0.6915, "step": 16230 }, { "epoch": 2.79, "grad_norm": 2.650279998779297, "learning_rate": 1e-05, "loss": 0.7687, "step": 16240 }, { "epoch": 2.79, "grad_norm": 3.1134865283966064, "learning_rate": 1e-05, "loss": 0.7498, "step": 16250 }, { "epoch": 2.79, "grad_norm": 2.4980340003967285, "learning_rate": 1e-05, "loss": 0.7477, "step": 16260 }, { "epoch": 2.79, "grad_norm": 2.9300968647003174, "learning_rate": 1e-05, "loss": 0.716, "step": 16270 }, { "epoch": 2.8, "grad_norm": 2.97686505317688, "learning_rate": 1e-05, "loss": 0.7463, "step": 16280 }, { "epoch": 2.8, "grad_norm": 2.8819618225097656, "learning_rate": 1e-05, "loss": 0.7069, "step": 16290 }, { "epoch": 2.8, "grad_norm": 2.8394768238067627, "learning_rate": 1e-05, "loss": 0.7733, "step": 16300 }, { "epoch": 2.8, "grad_norm": 2.063542127609253, "learning_rate": 1e-05, "loss": 0.8501, "step": 16310 }, { "epoch": 2.8, "grad_norm": 2.2984085083007812, "learning_rate": 1e-05, "loss": 0.696, "step": 16320 }, { "epoch": 2.8, "grad_norm": 2.8086931705474854, "learning_rate": 1e-05, "loss": 0.7987, "step": 16330 }, { "epoch": 2.81, "grad_norm": 3.405078411102295, "learning_rate": 1e-05, "loss": 0.7761, "step": 16340 }, { "epoch": 2.81, "grad_norm": 2.172470808029175, "learning_rate": 1e-05, "loss": 0.7769, "step": 16350 }, { "epoch": 2.81, "grad_norm": 3.121769666671753, "learning_rate": 1e-05, "loss": 0.7938, "step": 16360 }, { "epoch": 2.81, "grad_norm": 2.402902364730835, "learning_rate": 1e-05, "loss": 0.7294, "step": 16370 }, { "epoch": 2.81, "grad_norm": 2.813246726989746, "learning_rate": 1e-05, "loss": 0.771, "step": 16380 }, { "epoch": 2.81, "grad_norm": 2.922886848449707, "learning_rate": 1e-05, "loss": 0.817, "step": 16390 }, { "epoch": 2.82, "grad_norm": 2.243450403213501, "learning_rate": 1e-05, "loss": 0.7381, "step": 16400 }, { "epoch": 2.82, "grad_norm": 9.812843322753906, "learning_rate": 1e-05, "loss": 0.8756, "step": 16410 }, { "epoch": 2.82, "grad_norm": 2.703944206237793, "learning_rate": 1e-05, "loss": 0.6877, "step": 16420 }, { "epoch": 2.82, "grad_norm": 3.160672187805176, "learning_rate": 1e-05, "loss": 0.733, "step": 16430 }, { "epoch": 2.82, "grad_norm": 3.238607168197632, "learning_rate": 1e-05, "loss": 0.7106, "step": 16440 }, { "epoch": 2.83, "grad_norm": 3.620293140411377, "learning_rate": 1e-05, "loss": 0.7273, "step": 16450 }, { "epoch": 2.83, "grad_norm": 2.082348346710205, "learning_rate": 1e-05, "loss": 0.7373, "step": 16460 }, { "epoch": 2.83, "grad_norm": 3.2884418964385986, "learning_rate": 1e-05, "loss": 0.7943, "step": 16470 }, { "epoch": 2.83, "grad_norm": 2.3396670818328857, "learning_rate": 1e-05, "loss": 0.7064, "step": 16480 }, { "epoch": 2.83, "grad_norm": 3.7502334117889404, "learning_rate": 1e-05, "loss": 0.75, "step": 16490 }, { "epoch": 2.83, "grad_norm": 2.7468998432159424, "learning_rate": 1e-05, "loss": 0.7514, "step": 16500 }, { "epoch": 2.84, "grad_norm": 3.606981039047241, "learning_rate": 1e-05, "loss": 0.7411, "step": 16510 }, { "epoch": 2.84, "grad_norm": 2.4418702125549316, "learning_rate": 1e-05, "loss": 0.7346, "step": 16520 }, { "epoch": 2.84, "grad_norm": 2.283881902694702, "learning_rate": 1e-05, "loss": 0.6863, "step": 16530 }, { "epoch": 2.84, "grad_norm": 3.3163726329803467, "learning_rate": 1e-05, "loss": 0.7655, "step": 16540 }, { "epoch": 2.84, "grad_norm": 3.4496238231658936, "learning_rate": 1e-05, "loss": 0.8603, "step": 16550 }, { "epoch": 2.84, "grad_norm": 2.912651538848877, "learning_rate": 1e-05, "loss": 0.794, "step": 16560 }, { "epoch": 2.85, "grad_norm": 2.5364766120910645, "learning_rate": 1e-05, "loss": 0.725, "step": 16570 }, { "epoch": 2.85, "grad_norm": 3.1615214347839355, "learning_rate": 1e-05, "loss": 0.7345, "step": 16580 }, { "epoch": 2.85, "grad_norm": 3.17732834815979, "learning_rate": 1e-05, "loss": 0.7786, "step": 16590 }, { "epoch": 2.85, "grad_norm": 3.815878391265869, "learning_rate": 1e-05, "loss": 0.7871, "step": 16600 }, { "epoch": 2.85, "grad_norm": 3.21816086769104, "learning_rate": 1e-05, "loss": 0.7331, "step": 16610 }, { "epoch": 2.85, "grad_norm": 3.8046021461486816, "learning_rate": 1e-05, "loss": 0.7428, "step": 16620 }, { "epoch": 2.86, "grad_norm": 2.7986233234405518, "learning_rate": 1e-05, "loss": 0.7871, "step": 16630 }, { "epoch": 2.86, "grad_norm": 4.534671783447266, "learning_rate": 1e-05, "loss": 0.7033, "step": 16640 }, { "epoch": 2.86, "grad_norm": 2.722402572631836, "learning_rate": 1e-05, "loss": 0.7478, "step": 16650 }, { "epoch": 2.86, "grad_norm": 2.5120465755462646, "learning_rate": 1e-05, "loss": 0.8048, "step": 16660 }, { "epoch": 2.86, "grad_norm": 3.8931288719177246, "learning_rate": 1e-05, "loss": 0.7489, "step": 16670 }, { "epoch": 2.86, "grad_norm": 3.675823211669922, "learning_rate": 1e-05, "loss": 0.7575, "step": 16680 }, { "epoch": 2.87, "grad_norm": 2.4684035778045654, "learning_rate": 1e-05, "loss": 0.7459, "step": 16690 }, { "epoch": 2.87, "grad_norm": 2.9120452404022217, "learning_rate": 1e-05, "loss": 0.7388, "step": 16700 }, { "epoch": 2.87, "grad_norm": 3.1678876876831055, "learning_rate": 1e-05, "loss": 0.7115, "step": 16710 }, { "epoch": 2.87, "grad_norm": 3.2457306385040283, "learning_rate": 1e-05, "loss": 0.7298, "step": 16720 }, { "epoch": 2.87, "grad_norm": 4.585461139678955, "learning_rate": 1e-05, "loss": 0.7348, "step": 16730 }, { "epoch": 2.87, "grad_norm": 2.2121076583862305, "learning_rate": 1e-05, "loss": 0.6789, "step": 16740 }, { "epoch": 2.88, "grad_norm": 2.2857534885406494, "learning_rate": 1e-05, "loss": 0.6961, "step": 16750 }, { "epoch": 2.88, "grad_norm": 2.597043514251709, "learning_rate": 1e-05, "loss": 0.754, "step": 16760 }, { "epoch": 2.88, "grad_norm": 2.364671230316162, "learning_rate": 1e-05, "loss": 0.77, "step": 16770 }, { "epoch": 2.88, "grad_norm": 3.1742351055145264, "learning_rate": 1e-05, "loss": 0.7161, "step": 16780 }, { "epoch": 2.88, "grad_norm": 2.498741388320923, "learning_rate": 1e-05, "loss": 0.6919, "step": 16790 }, { "epoch": 2.89, "grad_norm": 2.582516670227051, "learning_rate": 1e-05, "loss": 0.7376, "step": 16800 }, { "epoch": 2.89, "grad_norm": 2.1765763759613037, "learning_rate": 1e-05, "loss": 0.6758, "step": 16810 }, { "epoch": 2.89, "grad_norm": 2.645017385482788, "learning_rate": 1e-05, "loss": 0.7498, "step": 16820 }, { "epoch": 2.89, "grad_norm": 2.96962833404541, "learning_rate": 1e-05, "loss": 0.7105, "step": 16830 }, { "epoch": 2.89, "grad_norm": 3.203653335571289, "learning_rate": 1e-05, "loss": 0.7472, "step": 16840 }, { "epoch": 2.89, "grad_norm": 2.739818572998047, "learning_rate": 1e-05, "loss": 0.7642, "step": 16850 }, { "epoch": 2.9, "grad_norm": 2.86613130569458, "learning_rate": 1e-05, "loss": 0.7257, "step": 16860 }, { "epoch": 2.9, "grad_norm": 2.8112566471099854, "learning_rate": 1e-05, "loss": 0.7461, "step": 16870 }, { "epoch": 2.9, "grad_norm": 3.9736883640289307, "learning_rate": 1e-05, "loss": 0.7416, "step": 16880 }, { "epoch": 2.9, "grad_norm": 2.1086666584014893, "learning_rate": 1e-05, "loss": 0.6816, "step": 16890 }, { "epoch": 2.9, "grad_norm": 3.0595312118530273, "learning_rate": 1e-05, "loss": 0.7162, "step": 16900 }, { "epoch": 2.9, "grad_norm": 2.9815711975097656, "learning_rate": 1e-05, "loss": 0.7105, "step": 16910 }, { "epoch": 2.91, "grad_norm": 3.7968990802764893, "learning_rate": 1e-05, "loss": 0.6456, "step": 16920 }, { "epoch": 2.91, "grad_norm": 3.099365472793579, "learning_rate": 1e-05, "loss": 0.6936, "step": 16930 }, { "epoch": 2.91, "grad_norm": 4.1419782638549805, "learning_rate": 1e-05, "loss": 0.7544, "step": 16940 }, { "epoch": 2.91, "grad_norm": 2.833970308303833, "learning_rate": 1e-05, "loss": 0.7226, "step": 16950 }, { "epoch": 2.91, "grad_norm": 2.6359775066375732, "learning_rate": 1e-05, "loss": 0.7183, "step": 16960 }, { "epoch": 2.91, "grad_norm": 3.082583427429199, "learning_rate": 1e-05, "loss": 0.7529, "step": 16970 }, { "epoch": 2.92, "grad_norm": 2.820340871810913, "learning_rate": 1e-05, "loss": 0.7711, "step": 16980 }, { "epoch": 2.92, "grad_norm": 2.642461061477661, "learning_rate": 1e-05, "loss": 0.7484, "step": 16990 }, { "epoch": 2.92, "grad_norm": 2.2459590435028076, "learning_rate": 1e-05, "loss": 0.7215, "step": 17000 }, { "epoch": 2.92, "grad_norm": 2.3110170364379883, "learning_rate": 1e-05, "loss": 0.7875, "step": 17010 }, { "epoch": 2.92, "grad_norm": 2.8215103149414062, "learning_rate": 1e-05, "loss": 0.8104, "step": 17020 }, { "epoch": 2.92, "grad_norm": 2.3981997966766357, "learning_rate": 1e-05, "loss": 0.7895, "step": 17030 }, { "epoch": 2.93, "grad_norm": 2.9426681995391846, "learning_rate": 1e-05, "loss": 0.7858, "step": 17040 }, { "epoch": 2.93, "grad_norm": 2.6029345989227295, "learning_rate": 1e-05, "loss": 0.8176, "step": 17050 }, { "epoch": 2.93, "grad_norm": 2.9794445037841797, "learning_rate": 1e-05, "loss": 0.7859, "step": 17060 }, { "epoch": 2.93, "grad_norm": 3.120633125305176, "learning_rate": 1e-05, "loss": 0.7464, "step": 17070 }, { "epoch": 2.93, "grad_norm": 5.730931282043457, "learning_rate": 1e-05, "loss": 0.8152, "step": 17080 }, { "epoch": 2.93, "grad_norm": 1.9469467401504517, "learning_rate": 1e-05, "loss": 0.7083, "step": 17090 }, { "epoch": 2.94, "grad_norm": 3.874636650085449, "learning_rate": 1e-05, "loss": 0.768, "step": 17100 }, { "epoch": 2.94, "grad_norm": 2.6435718536376953, "learning_rate": 1e-05, "loss": 0.7137, "step": 17110 }, { "epoch": 2.94, "grad_norm": 4.349155426025391, "learning_rate": 1e-05, "loss": 0.7452, "step": 17120 }, { "epoch": 2.94, "grad_norm": 4.12204647064209, "learning_rate": 1e-05, "loss": 0.7668, "step": 17130 }, { "epoch": 2.94, "grad_norm": 3.769453287124634, "learning_rate": 1e-05, "loss": 0.6694, "step": 17140 }, { "epoch": 2.95, "grad_norm": 2.3985166549682617, "learning_rate": 1e-05, "loss": 0.7694, "step": 17150 }, { "epoch": 2.95, "grad_norm": 2.7735278606414795, "learning_rate": 1e-05, "loss": 0.7295, "step": 17160 }, { "epoch": 2.95, "grad_norm": 2.5758328437805176, "learning_rate": 1e-05, "loss": 0.7437, "step": 17170 }, { "epoch": 2.95, "grad_norm": 3.3075079917907715, "learning_rate": 1e-05, "loss": 0.7758, "step": 17180 }, { "epoch": 2.95, "grad_norm": 2.7021961212158203, "learning_rate": 1e-05, "loss": 0.72, "step": 17190 }, { "epoch": 2.95, "grad_norm": 2.937143087387085, "learning_rate": 1e-05, "loss": 0.741, "step": 17200 }, { "epoch": 2.96, "grad_norm": 2.422276020050049, "learning_rate": 1e-05, "loss": 0.6805, "step": 17210 }, { "epoch": 2.96, "grad_norm": 2.0770819187164307, "learning_rate": 1e-05, "loss": 0.7353, "step": 17220 }, { "epoch": 2.96, "grad_norm": 2.305924654006958, "learning_rate": 1e-05, "loss": 0.6862, "step": 17230 }, { "epoch": 2.96, "grad_norm": 3.3312768936157227, "learning_rate": 1e-05, "loss": 0.7512, "step": 17240 }, { "epoch": 2.96, "grad_norm": 3.8097422122955322, "learning_rate": 1e-05, "loss": 0.777, "step": 17250 }, { "epoch": 2.96, "grad_norm": 2.78053617477417, "learning_rate": 1e-05, "loss": 0.6928, "step": 17260 }, { "epoch": 2.97, "grad_norm": 2.543670654296875, "learning_rate": 1e-05, "loss": 0.701, "step": 17270 }, { "epoch": 2.97, "grad_norm": 3.243194580078125, "learning_rate": 1e-05, "loss": 0.7368, "step": 17280 }, { "epoch": 2.97, "grad_norm": 3.155773878097534, "learning_rate": 1e-05, "loss": 0.7553, "step": 17290 }, { "epoch": 2.97, "grad_norm": 3.1233174800872803, "learning_rate": 1e-05, "loss": 0.7981, "step": 17300 }, { "epoch": 2.97, "grad_norm": 4.1581220626831055, "learning_rate": 1e-05, "loss": 0.7076, "step": 17310 }, { "epoch": 2.97, "grad_norm": 3.5240795612335205, "learning_rate": 1e-05, "loss": 0.7145, "step": 17320 }, { "epoch": 2.98, "grad_norm": 3.32393217086792, "learning_rate": 1e-05, "loss": 0.7386, "step": 17330 }, { "epoch": 2.98, "grad_norm": 3.0469563007354736, "learning_rate": 1e-05, "loss": 0.738, "step": 17340 }, { "epoch": 2.98, "grad_norm": 3.3284199237823486, "learning_rate": 1e-05, "loss": 0.8062, "step": 17350 }, { "epoch": 2.98, "grad_norm": 5.088879585266113, "learning_rate": 1e-05, "loss": 0.7776, "step": 17360 }, { "epoch": 2.98, "grad_norm": 7.793527603149414, "learning_rate": 1e-05, "loss": 0.7674, "step": 17370 }, { "epoch": 2.98, "grad_norm": 3.6413803100585938, "learning_rate": 1e-05, "loss": 0.8351, "step": 17380 }, { "epoch": 2.99, "grad_norm": 3.5020976066589355, "learning_rate": 1e-05, "loss": 0.6976, "step": 17390 }, { "epoch": 2.99, "grad_norm": 2.9255354404449463, "learning_rate": 1e-05, "loss": 0.7419, "step": 17400 }, { "epoch": 2.99, "grad_norm": 3.126370906829834, "learning_rate": 1e-05, "loss": 0.8116, "step": 17410 }, { "epoch": 2.99, "grad_norm": 3.6022801399230957, "learning_rate": 1e-05, "loss": 0.8049, "step": 17420 }, { "epoch": 2.99, "grad_norm": 3.4221079349517822, "learning_rate": 1e-05, "loss": 0.7251, "step": 17430 }, { "epoch": 3.0, "grad_norm": 3.1578404903411865, "learning_rate": 1e-05, "loss": 0.732, "step": 17440 }, { "epoch": 3.0, "grad_norm": 2.2212891578674316, "learning_rate": 1e-05, "loss": 0.6766, "step": 17450 }, { "epoch": 3.0, "grad_norm": 3.3322994709014893, "learning_rate": 1e-05, "loss": 0.7178, "step": 17460 }, { "epoch": 3.0, "eval_loss": 2.0239431858062744, "eval_runtime": 21.6628, "eval_samples_per_second": 46.162, "eval_steps_per_second": 46.162, "step": 17469 } ], "logging_steps": 10, "max_steps": 17469, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "total_flos": 4.146314247762785e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }