[ { "loss": 13.2026, "grad_norm": 6.155358791351318, "learning_rate": 1.739130434782609e-05, "epoch": 0.034782608695652174, "step": 20 }, { "loss": 13.1252, "grad_norm": 5.816741943359375, "learning_rate": 3.478260869565218e-05, "epoch": 0.06956521739130435, "step": 40 }, { "loss": 13.0001, "grad_norm": 5.273156642913818, "learning_rate": 5.2173913043478256e-05, "epoch": 0.10434782608695652, "step": 60 }, { "loss": 12.8639, "grad_norm": 4.86655330657959, "learning_rate": 6.956521739130436e-05, "epoch": 0.1391304347826087, "step": 80 }, { "loss": 12.7376, "grad_norm": 4.438321113586426, "learning_rate": 8.695652173913044e-05, "epoch": 0.17391304347826086, "step": 100 }, { "loss": 12.5722, "grad_norm": 4.164404392242432, "learning_rate": 0.00010434782608695651, "epoch": 0.20869565217391303, "step": 120 }, { "loss": 12.4229, "grad_norm": 3.858990430831909, "learning_rate": 0.00012173913043478261, "epoch": 0.24347826086956523, "step": 140 }, { "loss": 12.2581, "grad_norm": 3.6574394702911377, "learning_rate": 0.0001391304347826087, "epoch": 0.2782608695652174, "step": 160 }, { "loss": 12.0753, "grad_norm": 3.3787951469421387, "learning_rate": 0.0001565217391304348, "epoch": 0.3130434782608696, "step": 180 }, { "loss": 11.9261, "grad_norm": 3.323820114135742, "learning_rate": 0.00017391304347826088, "epoch": 0.34782608695652173, "step": 200 }, { "loss": 11.7417, "grad_norm": 3.247619152069092, "learning_rate": 0.00019130434782608697, "epoch": 0.3826086956521739, "step": 220 }, { "loss": 11.5771, "grad_norm": 3.2254152297973633, "learning_rate": 0.00020869565217391303, "epoch": 0.41739130434782606, "step": 240 }, { "loss": 11.3969, "grad_norm": 3.1803464889526367, "learning_rate": 0.00022608695652173914, "epoch": 0.45217391304347826, "step": 260 }, { "loss": 11.2684, "grad_norm": 3.41034197807312, "learning_rate": 0.00024347826086956522, "epoch": 0.48695652173913045, "step": 280 }, { "loss": 11.0744, "grad_norm": 3.246403217315674, "learning_rate": 0.0002608695652173913, "epoch": 0.5217391304347826, "step": 300 }, { "loss": 10.8929, "grad_norm": 3.202021360397339, "learning_rate": 0.0002782608695652174, "epoch": 0.5565217391304348, "step": 320 }, { "loss": 10.7468, "grad_norm": 3.1231367588043213, "learning_rate": 0.0002956521739130435, "epoch": 0.591304347826087, "step": 340 }, { "loss": 10.606, "grad_norm": 3.1820390224456787, "learning_rate": 0.0003130434782608696, "epoch": 0.6260869565217392, "step": 360 }, { "loss": 10.4871, "grad_norm": 3.2470555305480957, "learning_rate": 0.0003304347826086956, "epoch": 0.6608695652173913, "step": 380 }, { "loss": 10.2836, "grad_norm": 3.2452709674835205, "learning_rate": 0.00034782608695652176, "epoch": 0.6956521739130435, "step": 400 }, { "loss": 10.1154, "grad_norm": 3.203894853591919, "learning_rate": 0.00036521739130434785, "epoch": 0.7304347826086957, "step": 420 }, { "loss": 9.9283, "grad_norm": 3.269970178604126, "learning_rate": 0.00038260869565217393, "epoch": 0.7652173913043478, "step": 440 }, { "loss": 9.8674, "grad_norm": 3.261357545852661, "learning_rate": 0.0004, "epoch": 0.8, "step": 460 }, { "loss": 9.6224, "grad_norm": 3.393953323364258, "learning_rate": 0.00041739130434782605, "epoch": 0.8347826086956521, "step": 480 }, { "loss": 9.524, "grad_norm": 3.321411609649658, "learning_rate": 0.0004347826086956522, "epoch": 0.8695652173913043, "step": 500 }, { "loss": 9.384, "grad_norm": 3.3886823654174805, "learning_rate": 0.0004521739130434783, "epoch": 0.9043478260869565, "step": 520 }, { "loss": 9.1767, "grad_norm": 3.4735491275787354, "learning_rate": 0.00046956521739130436, "epoch": 0.9391304347826087, "step": 540 }, { "loss": 9.047, "grad_norm": 3.416966676712036, "learning_rate": 0.00048695652173913045, "epoch": 0.9739130434782609, "step": 560 }, { "eval_loss": 8.366157531738281, "eval_accuracy": 0.43039677202420984, "eval_runtime": 42.3364, "eval_samples_per_second": 35.123, "eval_steps_per_second": 35.123, "epoch": 1.0, "step": 575 }, { "loss": 8.8835, "grad_norm": 3.446899890899658, "learning_rate": 0.0004995169082125604, "epoch": 1.008695652173913, "step": 580 }, { "loss": 8.6436, "grad_norm": 3.5842247009277344, "learning_rate": 0.0004975845410628019, "epoch": 1.0434782608695652, "step": 600 }, { "loss": 8.4775, "grad_norm": 3.5029306411743164, "learning_rate": 0.0004956521739130435, "epoch": 1.0782608695652174, "step": 620 }, { "loss": 8.322, "grad_norm": 3.5451033115386963, "learning_rate": 0.0004937198067632851, "epoch": 1.1130434782608696, "step": 640 }, { "loss": 8.1264, "grad_norm": 3.5502634048461914, "learning_rate": 0.0004917874396135266, "epoch": 1.1478260869565218, "step": 660 }, { "loss": 7.9905, "grad_norm": 3.607395648956299, "learning_rate": 0.0004898550724637681, "epoch": 1.182608695652174, "step": 680 }, { "loss": 7.8252, "grad_norm": 3.6438565254211426, "learning_rate": 0.0004879227053140097, "epoch": 1.2173913043478262, "step": 700 }, { "loss": 7.7737, "grad_norm": 3.656705141067505, "learning_rate": 0.0004859903381642512, "epoch": 1.2521739130434781, "step": 720 }, { "loss": 7.5822, "grad_norm": 3.7424328327178955, "learning_rate": 0.0004840579710144928, "epoch": 1.2869565217391306, "step": 740 }, { "loss": 7.4563, "grad_norm": 3.673156261444092, "learning_rate": 0.0004821256038647343, "epoch": 1.3217391304347825, "step": 760 }, { "loss": 7.3379, "grad_norm": 3.6774067878723145, "learning_rate": 0.0004801932367149758, "epoch": 1.3565217391304347, "step": 780 }, { "loss": 7.1559, "grad_norm": 3.811283826828003, "learning_rate": 0.0004782608695652174, "epoch": 1.391304347826087, "step": 800 }, { "loss": 7.0834, "grad_norm": 3.7899839878082275, "learning_rate": 0.00047632850241545894, "epoch": 1.4260869565217391, "step": 820 }, { "loss": 6.9172, "grad_norm": 3.583247423171997, "learning_rate": 0.00047439613526570047, "epoch": 1.4608695652173913, "step": 840 }, { "loss": 6.7251, "grad_norm": 3.8192331790924072, "learning_rate": 0.00047246376811594206, "epoch": 1.4956521739130435, "step": 860 }, { "loss": 6.7871, "grad_norm": 3.8098299503326416, "learning_rate": 0.0004705314009661836, "epoch": 1.5304347826086957, "step": 880 }, { "loss": 6.6103, "grad_norm": 3.7341325283050537, "learning_rate": 0.0004685990338164252, "epoch": 1.5652173913043477, "step": 900 }, { "loss": 6.4507, "grad_norm": 3.9190495014190674, "learning_rate": 0.00046666666666666666, "epoch": 1.6, "step": 920 }, { "loss": 6.3619, "grad_norm": 3.9456422328948975, "learning_rate": 0.0004647342995169082, "epoch": 1.634782608695652, "step": 940 }, { "loss": 6.2957, "grad_norm": 3.899134874343872, "learning_rate": 0.0004628019323671498, "epoch": 1.6695652173913045, "step": 960 }, { "loss": 6.1362, "grad_norm": 3.878810167312622, "learning_rate": 0.0004608695652173913, "epoch": 1.7043478260869565, "step": 980 }, { "loss": 5.9814, "grad_norm": 3.9270784854888916, "learning_rate": 0.00045893719806763285, "epoch": 1.7391304347826086, "step": 1000 }, { "loss": 5.9095, "grad_norm": 3.8247644901275635, "learning_rate": 0.00045700483091787444, "epoch": 1.7739130434782608, "step": 1020 }, { "loss": 5.7793, "grad_norm": 3.8870134353637695, "learning_rate": 0.000455072463768116, "epoch": 1.808695652173913, "step": 1040 }, { "loss": 5.7754, "grad_norm": 3.9533441066741943, "learning_rate": 0.00045314009661835745, "epoch": 1.8434782608695652, "step": 1060 }, { "loss": 5.5886, "grad_norm": 3.9928998947143555, "learning_rate": 0.00045120772946859904, "epoch": 1.8782608695652174, "step": 1080 }, { "loss": 5.5482, "grad_norm": 4.030064582824707, "learning_rate": 0.0004492753623188406, "epoch": 1.9130434782608696, "step": 1100 }, { "loss": 5.4807, "grad_norm": 3.961806297302246, "learning_rate": 0.0004473429951690821, "epoch": 1.9478260869565216, "step": 1120 }, { "loss": 5.3508, "grad_norm": 4.003119945526123, "learning_rate": 0.0004454106280193237, "epoch": 1.982608695652174, "step": 1140 }, { "eval_loss": 4.025164604187012, "eval_accuracy": 0.8190988567585743, "eval_runtime": 42.7144, "eval_samples_per_second": 34.813, "eval_steps_per_second": 34.813, "epoch": 2.0, "step": 1150 }, { "loss": 5.1229, "grad_norm": 3.958116292953491, "learning_rate": 0.00044347826086956523, "epoch": 2.017391304347826, "step": 1160 }, { "loss": 4.8146, "grad_norm": 3.864279270172119, "learning_rate": 0.00044154589371980677, "epoch": 2.0521739130434784, "step": 1180 }, { "loss": 4.8843, "grad_norm": 4.045077323913574, "learning_rate": 0.0004396135265700483, "epoch": 2.0869565217391304, "step": 1200 }, { "loss": 4.8078, "grad_norm": 4.061978816986084, "learning_rate": 0.00043768115942028983, "epoch": 2.121739130434783, "step": 1220 }, { "loss": 4.6812, "grad_norm": 4.040159225463867, "learning_rate": 0.0004357487922705314, "epoch": 2.1565217391304348, "step": 1240 }, { "loss": 4.6701, "grad_norm": 4.234623908996582, "learning_rate": 0.00043381642512077296, "epoch": 2.1913043478260867, "step": 1260 }, { "loss": 4.6221, "grad_norm": 4.030038356781006, "learning_rate": 0.0004318840579710145, "epoch": 2.226086956521739, "step": 1280 }, { "loss": 4.5647, "grad_norm": 3.9954497814178467, "learning_rate": 0.0004299516908212561, "epoch": 2.260869565217391, "step": 1300 }, { "loss": 4.4502, "grad_norm": 4.188636779785156, "learning_rate": 0.0004280193236714976, "epoch": 2.2956521739130435, "step": 1320 }, { "loss": 4.359, "grad_norm": 4.185456275939941, "learning_rate": 0.00042608695652173915, "epoch": 2.3304347826086955, "step": 1340 }, { "loss": 4.2863, "grad_norm": 4.123263359069824, "learning_rate": 0.0004241545893719807, "epoch": 2.365217391304348, "step": 1360 }, { "loss": 4.3354, "grad_norm": 4.194387435913086, "learning_rate": 0.0004222222222222222, "epoch": 2.4, "step": 1380 }, { "loss": 4.2176, "grad_norm": 4.065763473510742, "learning_rate": 0.00042028985507246375, "epoch": 2.4347826086956523, "step": 1400 }, { "loss": 4.0597, "grad_norm": 4.120363712310791, "learning_rate": 0.00041835748792270534, "epoch": 2.4695652173913043, "step": 1420 }, { "loss": 4.028, "grad_norm": 4.3197174072265625, "learning_rate": 0.00041642512077294687, "epoch": 2.5043478260869563, "step": 1440 }, { "loss": 3.9833, "grad_norm": 4.2683610916137695, "learning_rate": 0.0004144927536231884, "epoch": 2.5391304347826087, "step": 1460 }, { "loss": 4.0065, "grad_norm": 4.15448522567749, "learning_rate": 0.00041256038647343, "epoch": 2.573913043478261, "step": 1480 }, { "loss": 3.8134, "grad_norm": 4.348177433013916, "learning_rate": 0.0004106280193236715, "epoch": 2.608695652173913, "step": 1500 }, { "loss": 3.8548, "grad_norm": 4.100021839141846, "learning_rate": 0.00040869565217391306, "epoch": 2.643478260869565, "step": 1520 }, { "loss": 3.7814, "grad_norm": 4.344174385070801, "learning_rate": 0.0004067632850241546, "epoch": 2.6782608695652175, "step": 1540 }, { "loss": 3.7578, "grad_norm": 4.240079402923584, "learning_rate": 0.00040483091787439613, "epoch": 2.7130434782608694, "step": 1560 }, { "loss": 3.7331, "grad_norm": 4.468689918518066, "learning_rate": 0.0004028985507246377, "epoch": 2.747826086956522, "step": 1580 }, { "loss": 3.6396, "grad_norm": 4.28464937210083, "learning_rate": 0.00040096618357487925, "epoch": 2.782608695652174, "step": 1600 }, { "loss": 3.5799, "grad_norm": 4.166805744171143, "learning_rate": 0.0003990338164251208, "epoch": 2.8173913043478263, "step": 1620 }, { "loss": 3.4734, "grad_norm": 4.237683296203613, "learning_rate": 0.0003971014492753624, "epoch": 2.8521739130434782, "step": 1640 }, { "loss": 3.5183, "grad_norm": 4.153097152709961, "learning_rate": 0.00039516908212560385, "epoch": 2.8869565217391306, "step": 1660 }, { "loss": 3.3963, "grad_norm": 4.2313947677612305, "learning_rate": 0.0003932367149758454, "epoch": 2.9217391304347826, "step": 1680 }, { "loss": 3.3081, "grad_norm": 3.992475748062134, "learning_rate": 0.000391304347826087, "epoch": 2.9565217391304346, "step": 1700 }, { "loss": 3.3124, "grad_norm": 4.4731059074401855, "learning_rate": 0.0003893719806763285, "epoch": 2.991304347826087, "step": 1720 }, { "eval_loss": 2.1082653999328613, "eval_accuracy": 0.9260255548083389, "eval_runtime": 22.1676, "eval_samples_per_second": 67.08, "eval_steps_per_second": 67.08, "epoch": 3.0, "step": 1725 }, { "loss": 3.1247, "grad_norm": 4.272000312805176, "learning_rate": 0.00038743961352657004, "epoch": 3.026086956521739, "step": 1740 }, { "loss": 3.1064, "grad_norm": 4.102330207824707, "learning_rate": 0.00038550724637681163, "epoch": 3.0608695652173914, "step": 1760 }, { "loss": 2.9371, "grad_norm": 4.381846904754639, "learning_rate": 0.00038357487922705317, "epoch": 3.0956521739130434, "step": 1780 }, { "loss": 2.9355, "grad_norm": 4.1588921546936035, "learning_rate": 0.00038164251207729465, "epoch": 3.130434782608696, "step": 1800 }, { "loss": 2.8545, "grad_norm": 4.279609203338623, "learning_rate": 0.00037971014492753623, "epoch": 3.1652173913043478, "step": 1820 }, { "loss": 2.8096, "grad_norm": 4.240756988525391, "learning_rate": 0.00037777777777777777, "epoch": 3.2, "step": 1840 }, { "loss": 2.8138, "grad_norm": 4.11091947555542, "learning_rate": 0.00037584541062801936, "epoch": 3.234782608695652, "step": 1860 }, { "loss": 2.7417, "grad_norm": 4.078794479370117, "learning_rate": 0.0003739130434782609, "epoch": 3.269565217391304, "step": 1880 }, { "loss": 2.7937, "grad_norm": 4.368116855621338, "learning_rate": 0.0003719806763285024, "epoch": 3.3043478260869565, "step": 1900 }, { "loss": 2.7361, "grad_norm": 4.044319152832031, "learning_rate": 0.000370048309178744, "epoch": 3.3391304347826085, "step": 1920 }, { "loss": 2.7054, "grad_norm": 4.314040184020996, "learning_rate": 0.0003681159420289855, "epoch": 3.373913043478261, "step": 1940 }, { "loss": 2.6682, "grad_norm": 4.185855388641357, "learning_rate": 0.000366183574879227, "epoch": 3.408695652173913, "step": 1960 }, { "loss": 2.6644, "grad_norm": 4.433622360229492, "learning_rate": 0.0003642512077294686, "epoch": 3.4434782608695653, "step": 1980 }, { "loss": 2.618, "grad_norm": 4.048947811126709, "learning_rate": 0.00036231884057971015, "epoch": 3.4782608695652173, "step": 2000 }, { "loss": 2.5982, "grad_norm": 4.145406246185303, "learning_rate": 0.0003603864734299517, "epoch": 3.5130434782608697, "step": 2020 }, { "loss": 2.6138, "grad_norm": 4.2812910079956055, "learning_rate": 0.00035845410628019327, "epoch": 3.5478260869565217, "step": 2040 }, { "loss": 2.5039, "grad_norm": 4.400162220001221, "learning_rate": 0.0003565217391304348, "epoch": 3.5826086956521737, "step": 2060 }, { "loss": 2.5249, "grad_norm": 4.217800617218018, "learning_rate": 0.0003545893719806763, "epoch": 3.617391304347826, "step": 2080 }, { "loss": 2.4547, "grad_norm": 4.076215744018555, "learning_rate": 0.0003526570048309179, "epoch": 3.6521739130434785, "step": 2100 }, { "loss": 2.4315, "grad_norm": 4.139514446258545, "learning_rate": 0.0003507246376811594, "epoch": 3.6869565217391305, "step": 2120 }, { "loss": 2.3836, "grad_norm": 4.118022918701172, "learning_rate": 0.00034879227053140094, "epoch": 3.7217391304347824, "step": 2140 }, { "loss": 2.3284, "grad_norm": 4.137601852416992, "learning_rate": 0.00034685990338164253, "epoch": 3.756521739130435, "step": 2160 }, { "loss": 2.3095, "grad_norm": 4.023979663848877, "learning_rate": 0.00034492753623188406, "epoch": 3.791304347826087, "step": 2180 }, { "loss": 2.305, "grad_norm": 4.042725086212158, "learning_rate": 0.00034299516908212565, "epoch": 3.8260869565217392, "step": 2200 }, { "loss": 2.3237, "grad_norm": 4.265875339508057, "learning_rate": 0.0003410628019323672, "epoch": 3.860869565217391, "step": 2220 }, { "loss": 2.335, "grad_norm": 4.205041408538818, "learning_rate": 0.00033913043478260867, "epoch": 3.8956521739130436, "step": 2240 }, { "loss": 2.2341, "grad_norm": 4.1344709396362305, "learning_rate": 0.00033719806763285025, "epoch": 3.9304347826086956, "step": 2260 }, { "loss": 2.251, "grad_norm": 4.247790813446045, "learning_rate": 0.0003352657004830918, "epoch": 3.965217391304348, "step": 2280 }, { "loss": 2.3212, "grad_norm": 4.859626770019531, "learning_rate": 0.0003333333333333333, "epoch": 4.0, "step": 2300 }, { "eval_loss": 1.2223739624023438, "eval_accuracy": 0.9435104236718225, "eval_runtime": 14.8513, "eval_samples_per_second": 100.126, "eval_steps_per_second": 100.126, "epoch": 4.0, "step": 2300 }, { "loss": 1.9133, "grad_norm": 4.098020553588867, "learning_rate": 0.0003314009661835749, "epoch": 4.034782608695652, "step": 2320 }, { "loss": 1.9814, "grad_norm": 4.198029041290283, "learning_rate": 0.00032946859903381644, "epoch": 4.069565217391304, "step": 2340 }, { "loss": 1.9505, "grad_norm": 3.960844039916992, "learning_rate": 0.000327536231884058, "epoch": 4.104347826086957, "step": 2360 }, { "loss": 1.8815, "grad_norm": 4.0190300941467285, "learning_rate": 0.0003256038647342995, "epoch": 4.139130434782609, "step": 2380 }, { "loss": 1.8365, "grad_norm": 4.040708541870117, "learning_rate": 0.00032367149758454105, "epoch": 4.173913043478261, "step": 2400 }, { "loss": 1.84, "grad_norm": 4.077364444732666, "learning_rate": 0.0003217391304347826, "epoch": 4.208695652173913, "step": 2420 }, { "loss": 1.8864, "grad_norm": 4.267309188842773, "learning_rate": 0.0003199033816425121, "epoch": 4.243478260869566, "step": 2440 }, { "loss": 1.9015, "grad_norm": 3.978663921356201, "learning_rate": 0.00031797101449275363, "epoch": 4.278260869565218, "step": 2460 }, { "loss": 1.8388, "grad_norm": 4.089256763458252, "learning_rate": 0.0003160386473429952, "epoch": 4.3130434782608695, "step": 2480 }, { "loss": 1.7845, "grad_norm": 3.9317057132720947, "learning_rate": 0.0003141062801932367, "epoch": 4.3478260869565215, "step": 2500 }, { "loss": 1.7725, "grad_norm": 3.9738080501556396, "learning_rate": 0.00031217391304347823, "epoch": 4.3826086956521735, "step": 2520 }, { "loss": 1.852, "grad_norm": 4.232215881347656, "learning_rate": 0.0003102415458937198, "epoch": 4.417391304347826, "step": 2540 }, { "loss": 1.8234, "grad_norm": 4.050131797790527, "learning_rate": 0.00030830917874396136, "epoch": 4.452173913043478, "step": 2560 }, { "loss": 1.8148, "grad_norm": 4.217935085296631, "learning_rate": 0.0003063768115942029, "epoch": 4.48695652173913, "step": 2580 }, { "loss": 1.7134, "grad_norm": 3.9807074069976807, "learning_rate": 0.0003044444444444445, "epoch": 4.521739130434782, "step": 2600 }, { "loss": 1.6752, "grad_norm": 4.05940580368042, "learning_rate": 0.000302512077294686, "epoch": 4.556521739130435, "step": 2620 }, { "loss": 1.8413, "grad_norm": 4.454566955566406, "learning_rate": 0.00030057971014492755, "epoch": 4.591304347826087, "step": 2640 }, { "loss": 1.7948, "grad_norm": 4.144088268280029, "learning_rate": 0.0002986473429951691, "epoch": 4.626086956521739, "step": 2660 }, { "loss": 1.7468, "grad_norm": 3.940176010131836, "learning_rate": 0.0002967149758454106, "epoch": 4.660869565217391, "step": 2680 }, { "loss": 1.709, "grad_norm": 4.198675632476807, "learning_rate": 0.0002948792270531401, "epoch": 4.695652173913043, "step": 2700 }, { "loss": 1.6506, "grad_norm": 3.976001501083374, "learning_rate": 0.00029294685990338167, "epoch": 4.730434782608696, "step": 2720 }, { "loss": 1.7042, "grad_norm": 4.033059120178223, "learning_rate": 0.0002910144927536232, "epoch": 4.765217391304348, "step": 2740 }, { "loss": 1.6795, "grad_norm": 4.062041759490967, "learning_rate": 0.0002890821256038648, "epoch": 4.8, "step": 2760 }, { "loss": 1.7029, "grad_norm": 3.988589286804199, "learning_rate": 0.00028714975845410627, "epoch": 4.834782608695652, "step": 2780 }, { "loss": 1.6641, "grad_norm": 4.16325044631958, "learning_rate": 0.0002852173913043478, "epoch": 4.869565217391305, "step": 2800 }, { "loss": 1.6953, "grad_norm": 4.323537349700928, "learning_rate": 0.0002832850241545894, "epoch": 4.904347826086957, "step": 2820 }, { "loss": 1.5863, "grad_norm": 3.8293144702911377, "learning_rate": 0.0002813526570048309, "epoch": 4.939130434782609, "step": 2840 }, { "loss": 1.6276, "grad_norm": 3.8955535888671875, "learning_rate": 0.00027942028985507246, "epoch": 4.973913043478261, "step": 2860 }, { "eval_loss": 0.8229038715362549, "eval_accuracy": 0.9677202420981843, "eval_runtime": 88.6744, "eval_samples_per_second": 16.769, "eval_steps_per_second": 16.769, "epoch": 5.0, "step": 2875 }, { "loss": 1.5701, "grad_norm": 3.8480091094970703, "learning_rate": 0.00027748792270531405, "epoch": 5.008695652173913, "step": 2880 }, { "loss": 1.3786, "grad_norm": 3.679872512817383, "learning_rate": 0.0002755555555555556, "epoch": 5.043478260869565, "step": 2900 }, { "loss": 1.3563, "grad_norm": 4.13381290435791, "learning_rate": 0.00027362318840579706, "epoch": 5.078260869565217, "step": 2920 }, { "loss": 1.3588, "grad_norm": 3.7467329502105713, "learning_rate": 0.00027169082125603865, "epoch": 5.113043478260869, "step": 2940 }, { "loss": 1.3782, "grad_norm": 3.5837419033050537, "learning_rate": 0.0002698550724637681, "epoch": 5.147826086956521, "step": 2960 }, { "loss": 1.3969, "grad_norm": 4.077097415924072, "learning_rate": 0.00026792270531400964, "epoch": 5.182608695652174, "step": 2980 }, { "loss": 1.3346, "grad_norm": 3.5995211601257324, "learning_rate": 0.00026599033816425123, "epoch": 5.217391304347826, "step": 3000 }, { "loss": 1.3772, "grad_norm": 3.714010000228882, "learning_rate": 0.00026405797101449277, "epoch": 5.252173913043478, "step": 3020 }, { "loss": 1.3452, "grad_norm": 3.807094097137451, "learning_rate": 0.00026231884057971016, "epoch": 5.28695652173913, "step": 3040 }, { "loss": 1.3161, "grad_norm": 4.012477397918701, "learning_rate": 0.0002603864734299517, "epoch": 5.321739130434783, "step": 3060 }, { "loss": 1.3146, "grad_norm": 3.850520372390747, "learning_rate": 0.0002584541062801932, "epoch": 5.356521739130435, "step": 3080 }, { "loss": 1.3057, "grad_norm": NaN, "learning_rate": 0.00025661835748792274, "epoch": 5.391304347826087, "step": 3100 }, { "loss": 1.2619, "grad_norm": 3.697744607925415, "learning_rate": 0.0002546859903381643, "epoch": 5.426086956521739, "step": 3120 }, { "loss": 1.3436, "grad_norm": 4.125018119812012, "learning_rate": 0.00025275362318840576, "epoch": 5.460869565217392, "step": 3140 }, { "loss": 1.3289, "grad_norm": 4.1491899490356445, "learning_rate": 0.00025082125603864735, "epoch": 5.495652173913044, "step": 3160 }, { "loss": 1.218, "grad_norm": 3.9294846057891846, "learning_rate": 0.0002488888888888889, "epoch": 5.530434782608696, "step": 3180 }, { "loss": 1.3219, "grad_norm": 3.9030706882476807, "learning_rate": 0.00024695652173913047, "epoch": 5.565217391304348, "step": 3200 }, { "loss": 1.2694, "grad_norm": 4.124849319458008, "learning_rate": 0.000245024154589372, "epoch": 5.6, "step": 3220 }, { "loss": 1.2379, "grad_norm": 4.1668500900268555, "learning_rate": 0.0002432850241545894, "epoch": 5.6347826086956525, "step": 3240 }, { "loss": 1.2892, "grad_norm": 4.098198890686035, "learning_rate": 0.00024135265700483093, "epoch": 5.6695652173913045, "step": 3260 }, { "loss": 1.2742, "grad_norm": 3.690241813659668, "learning_rate": 0.00023942028985507246, "epoch": 5.7043478260869565, "step": 3280 }, { "loss": 1.1755, "grad_norm": 3.978963613510132, "learning_rate": 0.00023748792270531402, "epoch": 5.739130434782608, "step": 3300 }, { "loss": 1.2256, "grad_norm": 3.7397215366363525, "learning_rate": 0.00023574879227053139, "epoch": 5.773913043478261, "step": 3320 }, { "loss": 1.238, "grad_norm": 3.9201064109802246, "learning_rate": 0.00023391304347826088, "epoch": 5.808695652173913, "step": 3340 }, { "loss": 1.1706, "grad_norm": 3.725389242172241, "learning_rate": 0.0002319806763285024, "epoch": 5.843478260869565, "step": 3360 }, { "loss": 1.1644, "grad_norm": 3.5844123363494873, "learning_rate": 0.00023004830917874397, "epoch": 5.878260869565217, "step": 3380 }, { "loss": 1.2256, "grad_norm": 3.79936146736145, "learning_rate": 0.00022821256038647343, "epoch": 5.913043478260869, "step": 3400 }, { "loss": 1.2488, "grad_norm": 3.5947725772857666, "learning_rate": 0.00022628019323671497, "epoch": 5.947826086956522, "step": 3420 }, { "loss": 1.1418, "grad_norm": NaN, "learning_rate": 0.00022444444444444446, "epoch": 5.982608695652174, "step": 3440 }, { "eval_loss": 0.5840117335319519, "eval_accuracy": 0.9757901815736382, "eval_runtime": 97.2696, "eval_samples_per_second": 15.287, "eval_steps_per_second": 15.287, "epoch": 6.0, "step": 3450 }, { "loss": 1.1254, "grad_norm": 3.5959298610687256, "learning_rate": 0.00022260869565217392, "epoch": 6.017391304347826, "step": 3460 }, { "loss": 1.0343, "grad_norm": 3.9623775482177734, "learning_rate": 0.00022067632850241545, "epoch": 6.052173913043478, "step": 3480 }, { "loss": 1.0348, "grad_norm": 3.735102415084839, "learning_rate": 0.00021874396135265702, "epoch": 6.086956521739131, "step": 3500 }, { "loss": 0.9796, "grad_norm": 3.4255013465881348, "learning_rate": 0.00021681159420289855, "epoch": 6.121739130434783, "step": 3520 }, { "loss": 0.9865, "grad_norm": 3.981841564178467, "learning_rate": 0.00021497584541062804, "epoch": 6.156521739130435, "step": 3540 }, { "loss": 1.0054, "grad_norm": 3.9057116508483887, "learning_rate": 0.00021314009661835748, "epoch": 6.191304347826087, "step": 3560 }, { "loss": 1.0012, "grad_norm": 3.626560688018799, "learning_rate": 0.00021120772946859904, "epoch": 6.226086956521739, "step": 3580 }, { "loss": 1.0129, "grad_norm": 3.687683582305908, "learning_rate": 0.0002093719806763285, "epoch": 6.260869565217392, "step": 3600 }, { "loss": 0.9333, "grad_norm": 3.8632826805114746, "learning_rate": 0.00020763285024154592, "epoch": 6.2956521739130435, "step": 3620 }, { "loss": 1.0259, "grad_norm": 4.089422702789307, "learning_rate": 0.0002058937198067633, "epoch": 6.3304347826086955, "step": 3640 }, { "loss": 1.0184, "grad_norm": 4.261268615722656, "learning_rate": 0.00020415458937198067, "epoch": 6.3652173913043475, "step": 3660 }, { "loss": 1.0293, "grad_norm": 2.3901586532592773, "learning_rate": 0.0002026086956521739, "epoch": 6.4, "step": 3680 }, { "loss": 1.0026, "grad_norm": 2.233633518218994, "learning_rate": 0.00020067632850241546, "epoch": 6.434782608695652, "step": 3700 }, { "loss": 1.0426, "grad_norm": 2.049773693084717, "learning_rate": 0.00019893719806763285, "epoch": 6.469565217391304, "step": 3720 }, { "loss": 1.0324, "grad_norm": 2.21939754486084, "learning_rate": 0.0001970048309178744, "epoch": 6.504347826086956, "step": 3740 }, { "loss": 1.0666, "grad_norm": 2.2138895988464355, "learning_rate": 0.00019516908212560387, "epoch": 6.539130434782608, "step": 3760 }, { "loss": 1.0724, "grad_norm": 1.9186855554580688, "learning_rate": 0.0001932367149758454, "epoch": 6.573913043478261, "step": 3780 }, { "loss": 1.0867, "grad_norm": 1.302451729774475, "learning_rate": 0.00019159420289855073, "epoch": 6.608695652173913, "step": 3800 }, { "loss": 1.0659, "grad_norm": 1.1770459413528442, "learning_rate": 0.00018975845410628022, "epoch": 6.643478260869565, "step": 3820 }, { "loss": 1.0494, "grad_norm": 0.2651650309562683, "learning_rate": 0.0001881159420289855, "epoch": 6.678260869565217, "step": 3840 }, { "loss": 1.0464, "grad_norm": 0.0, "learning_rate": 0.0001867632850241546, "epoch": 6.71304347826087, "step": 3860 }, { "loss": 1.0457, "grad_norm": 0.0, "learning_rate": 0.000185024154589372, "epoch": 6.747826086956522, "step": 3880 }, { "loss": 0.9815, "grad_norm": 0.0, "learning_rate": 0.00018328502415458937, "epoch": 6.782608695652174, "step": 3900 }, { "loss": 1.0094, "grad_norm": 0.0, "learning_rate": 0.0001816425120772947, "epoch": 6.817391304347826, "step": 3920 }, { "loss": 1.0023, "grad_norm": NaN, "learning_rate": 0.00018028985507246377, "epoch": 6.852173913043478, "step": 3940 }, { "loss": 1.0278, "grad_norm": 0.0, "learning_rate": 0.00017893719806763288, "epoch": 6.886956521739131, "step": 3960 }, { "loss": 1.0123, "grad_norm": 0.0, "learning_rate": 0.0001771014492753623, "epoch": 6.921739130434783, "step": 3980 }, { "loss": 1.0774, "grad_norm": 0.0, "learning_rate": 0.00017565217391304346, "epoch": 6.956521739130435, "step": 4000 }, { "loss": 1.0484, "grad_norm": 0.0, "learning_rate": 0.00017391304347826088, "epoch": 6.9913043478260875, "step": 4020 }, { "eval_loss": 0.5780686736106873, "eval_accuracy": 0.9737726967047747, "eval_runtime": 118.8154, "eval_samples_per_second": 12.515, "eval_steps_per_second": 12.515, "epoch": 7.0, "step": 4025 }, { "loss": 0.9799, "grad_norm": 0.0, "learning_rate": 0.0001723671497584541, "epoch": 7.026086956521739, "step": 4040 }, { "loss": 0.9588, "grad_norm": 0.0, "learning_rate": 0.00017091787439613525, "epoch": 7.060869565217391, "step": 4060 }, { "loss": 0.9421, "grad_norm": NaN, "learning_rate": 0.00016966183574879226, "epoch": 7.095652173913043, "step": 4080 }, { "loss": 0.9551, "grad_norm": 0.0, "learning_rate": 0.00016782608695652175, "epoch": 7.130434782608695, "step": 4100 }, { "loss": 0.9622, "grad_norm": 0.0, "learning_rate": 0.00016618357487922704, "epoch": 7.165217391304348, "step": 4120 }, { "loss": 0.9712, "grad_norm": 0.0, "learning_rate": 0.00016444444444444446, "epoch": 7.2, "step": 4140 }, { "loss": 0.9834, "grad_norm": 0.0, "learning_rate": 0.00016299516908212561, "epoch": 7.234782608695652, "step": 4160 }, { "loss": 0.9968, "grad_norm": NaN, "learning_rate": 0.00016135265700483093, "epoch": 7.269565217391304, "step": 4180 }, { "loss": 0.956, "grad_norm": 0.0, "learning_rate": 0.00015961352657004833, "epoch": 7.304347826086957, "step": 4200 }, { "loss": 0.8981, "grad_norm": 0.0, "learning_rate": 0.00015806763285024155, "epoch": 7.339130434782609, "step": 4220 }, { "loss": 0.9515, "grad_norm": 0.0, "learning_rate": 0.00015642512077294684, "epoch": 7.373913043478261, "step": 4240 }, { "loss": 0.9535, "grad_norm": 0.0, "learning_rate": 0.0001548792270531401, "epoch": 7.408695652173913, "step": 4260 }, { "loss": 0.9646, "grad_norm": NaN, "learning_rate": 0.00015333333333333334, "epoch": 7.443478260869565, "step": 4280 }, { "loss": 0.9821, "grad_norm": 0.0, "learning_rate": 0.00015140096618357487, "epoch": 7.478260869565218, "step": 4300 }, { "loss": 0.9259, "grad_norm": 0.0, "learning_rate": 0.00015014492753623188, "epoch": 7.51304347826087, "step": 4320 }, { "loss": 0.9494, "grad_norm": 0.0, "learning_rate": 0.00014869565217391303, "epoch": 7.547826086956522, "step": 4340 }, { "loss": 0.9305, "grad_norm": 0.0, "learning_rate": 0.00014714975845410628, "epoch": 7.582608695652174, "step": 4360 }, { "loss": 0.8889, "grad_norm": 0.0, "learning_rate": 0.0001455072463768116, "epoch": 7.6173913043478265, "step": 4380 }, { "loss": 0.9524, "grad_norm": 0.0, "learning_rate": 0.00014396135265700482, "epoch": 7.6521739130434785, "step": 4400 }, { "loss": 0.9065, "grad_norm": 0.0, "learning_rate": 0.00014231884057971014, "epoch": 7.6869565217391305, "step": 4420 }, { "loss": 0.9153, "grad_norm": 0.0, "learning_rate": 0.00014048309178743963, "epoch": 7.721739130434782, "step": 4440 }, { "loss": 0.6675, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.756521739130434, "step": 4460 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.791304347826087, "step": 4480 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.826086956521739, "step": 4500 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.860869565217391, "step": 4520 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.895652173913043, "step": 4540 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.930434782608696, "step": 4560 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 7.965217391304348, "step": 4580 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.0, "step": 4600 }, { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 129.6238, "eval_samples_per_second": 11.472, "eval_steps_per_second": 11.472, "epoch": 8.0, "step": 4600 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.034782608695652, "step": 4620 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.069565217391304, "step": 4640 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.104347826086956, "step": 4660 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.139130434782608, "step": 4680 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.173913043478262, "step": 4700 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.208695652173914, "step": 4720 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.243478260869566, "step": 4740 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.278260869565218, "step": 4760 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.31304347826087, "step": 4780 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.347826086956522, "step": 4800 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.382608695652173, "step": 4820 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.417391304347825, "step": 4840 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.452173913043477, "step": 4860 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.486956521739131, "step": 4880 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.521739130434783, "step": 4900 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.556521739130435, "step": 4920 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.591304347826087, "step": 4940 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.626086956521739, "step": 4960 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.660869565217391, "step": 4980 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.695652173913043, "step": 5000 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.730434782608695, "step": 5020 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.765217391304347, "step": 5040 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.8, "step": 5060 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.834782608695653, "step": 5080 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.869565217391305, "step": 5100 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.904347826086957, "step": 5120 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.939130434782609, "step": 5140 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 8.97391304347826, "step": 5160 }, { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 117.1288, "eval_samples_per_second": 12.695, "eval_steps_per_second": 12.695, "epoch": 9.0, "step": 5175 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.008695652173913, "step": 5180 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.043478260869565, "step": 5200 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.078260869565218, "step": 5220 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.11304347826087, "step": 5240 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.147826086956522, "step": 5260 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.182608695652174, "step": 5280 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.217391304347826, "step": 5300 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.252173913043478, "step": 5320 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.28695652173913, "step": 5340 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.321739130434782, "step": 5360 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.356521739130434, "step": 5380 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.391304347826088, "step": 5400 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.42608695652174, "step": 5420 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.460869565217392, "step": 5440 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.495652173913044, "step": 5460 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.530434782608696, "step": 5480 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.565217391304348, "step": 5500 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.6, "step": 5520 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.634782608695652, "step": 5540 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.669565217391304, "step": 5560 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.704347826086957, "step": 5580 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.73913043478261, "step": 5600 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.773913043478261, "step": 5620 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.808695652173913, "step": 5640 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.843478260869565, "step": 5660 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.878260869565217, "step": 5680 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.91304347826087, "step": 5700 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.947826086956521, "step": 5720 }, { "loss": 0.0, "grad_norm": NaN, "learning_rate": 0.0001403864734299517, "epoch": 9.982608695652173, "step": 5740 }, { "eval_loss": NaN, "eval_accuracy": 0.0006724949562878278, "eval_runtime": 103.3199, "eval_samples_per_second": 14.392, "eval_steps_per_second": 14.392, "epoch": 10.0, "step": 5750 }, { "train_runtime": 59857.6179, "train_samples_per_second": 24.584, "train_steps_per_second": 0.096, "total_flos": 2.7398100529152e+18, "train_loss": 2.9414075751926587, "epoch": 10.0, "step": 5750 } ]