{ "best_metric": 1.6068978309631348, "best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-12420", "epoch": 9.049218813243746, "eval_steps": 90, "global_step": 12420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.2670376300811768, "learning_rate": 4.166666666666667e-06, "loss": 7.2579, "step": 10 }, { "epoch": 0.02, "grad_norm": 3.1587345600128174, "learning_rate": 8.333333333333334e-06, "loss": 7.2077, "step": 20 }, { "epoch": 0.03, "grad_norm": 2.9462554454803467, "learning_rate": 1.25e-05, "loss": 7.1099, "step": 30 }, { "epoch": 0.03, "grad_norm": 2.7092673778533936, "learning_rate": 1.6666666666666667e-05, "loss": 6.9866, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.605360269546509, "learning_rate": 2.0833333333333333e-05, "loss": 6.87, "step": 50 }, { "epoch": 0.05, "grad_norm": 2.5747287273406982, "learning_rate": 2.5e-05, "loss": 6.7736, "step": 60 }, { "epoch": 0.06, "grad_norm": 2.551903247833252, "learning_rate": 2.9166666666666666e-05, "loss": 6.6903, "step": 70 }, { "epoch": 0.07, "grad_norm": 2.5231289863586426, "learning_rate": 3.3333333333333335e-05, "loss": 6.6151, "step": 80 }, { "epoch": 0.08, "grad_norm": 2.503075361251831, "learning_rate": 3.75e-05, "loss": 6.5462, "step": 90 }, { "epoch": 0.08, "eval_accuracy": 0.2242063046599867, "eval_loss": 6.500818729400635, "eval_runtime": 1083.9947, "eval_samples_per_second": 460.679, "eval_steps_per_second": 2.399, "step": 90 }, { "epoch": 0.08, "grad_norm": 2.4953837394714355, "learning_rate": 4.1666666666666665e-05, "loss": 6.4802, "step": 100 }, { "epoch": 0.09, "grad_norm": 2.4765946865081787, "learning_rate": 4.5833333333333334e-05, "loss": 6.4127, "step": 110 }, { "epoch": 0.1, "grad_norm": 2.4805634021759033, "learning_rate": 5e-05, "loss": 6.3425, "step": 120 }, { "epoch": 0.11, "grad_norm": 2.4720654487609863, "learning_rate": 5.416666666666667e-05, "loss": 6.2706, "step": 130 }, { "epoch": 0.12, "grad_norm": 2.454899787902832, "learning_rate": 5.833333333333333e-05, "loss": 6.1941, "step": 140 }, { "epoch": 0.13, "grad_norm": 2.4357142448425293, "learning_rate": 6.25e-05, "loss": 6.1169, "step": 150 }, { "epoch": 0.14, "grad_norm": 2.4193003177642822, "learning_rate": 6.666666666666667e-05, "loss": 6.0351, "step": 160 }, { "epoch": 0.14, "grad_norm": 2.396649122238159, "learning_rate": 7.083333333333334e-05, "loss": 5.9532, "step": 170 }, { "epoch": 0.15, "grad_norm": 2.3566408157348633, "learning_rate": 7.5e-05, "loss": 5.8695, "step": 180 }, { "epoch": 0.15, "eval_accuracy": 0.22404351306840548, "eval_loss": 5.819457530975342, "eval_runtime": 1077.6772, "eval_samples_per_second": 463.38, "eval_steps_per_second": 2.414, "step": 180 }, { "epoch": 0.16, "grad_norm": 2.310816526412964, "learning_rate": 7.916666666666666e-05, "loss": 5.7876, "step": 190 }, { "epoch": 0.17, "grad_norm": 2.2797272205352783, "learning_rate": 8.333333333333333e-05, "loss": 5.706, "step": 200 }, { "epoch": 0.18, "grad_norm": 2.2188355922698975, "learning_rate": 8.75e-05, "loss": 5.6255, "step": 210 }, { "epoch": 0.19, "grad_norm": 2.142122745513916, "learning_rate": 9.166666666666667e-05, "loss": 5.5471, "step": 220 }, { "epoch": 0.2, "grad_norm": 2.069880485534668, "learning_rate": 9.583333333333334e-05, "loss": 5.4704, "step": 230 }, { "epoch": 0.2, "grad_norm": 1.957664966583252, "learning_rate": 0.0001, "loss": 5.4003, "step": 240 }, { "epoch": 0.21, "grad_norm": 1.8441264629364014, "learning_rate": 0.00010416666666666667, "loss": 5.3306, "step": 250 }, { "epoch": 0.22, "grad_norm": 1.722961664199829, "learning_rate": 0.00010833333333333334, "loss": 5.2648, "step": 260 }, { "epoch": 0.23, "grad_norm": 1.5622942447662354, "learning_rate": 0.00011250000000000001, "loss": 5.2004, "step": 270 }, { "epoch": 0.23, "eval_accuracy": 0.22578753539560809, "eval_loss": 5.162991046905518, "eval_runtime": 1079.7321, "eval_samples_per_second": 462.498, "eval_steps_per_second": 2.409, "step": 270 }, { "epoch": 0.24, "grad_norm": 1.4034879207611084, "learning_rate": 0.00011666666666666667, "loss": 5.144, "step": 280 }, { "epoch": 0.25, "grad_norm": 1.2136635780334473, "learning_rate": 0.00012083333333333333, "loss": 5.087, "step": 290 }, { "epoch": 0.25, "grad_norm": 0.9789605140686035, "learning_rate": 0.000125, "loss": 5.0347, "step": 300 }, { "epoch": 0.26, "grad_norm": 0.7709304094314575, "learning_rate": 0.00012916666666666667, "loss": 4.9873, "step": 310 }, { "epoch": 0.27, "grad_norm": 0.5693560838699341, "learning_rate": 0.00013333333333333334, "loss": 4.95, "step": 320 }, { "epoch": 0.28, "grad_norm": 0.42085811495780945, "learning_rate": 0.0001375, "loss": 4.9181, "step": 330 }, { "epoch": 0.29, "grad_norm": 0.3126681447029114, "learning_rate": 0.00014166666666666668, "loss": 4.8959, "step": 340 }, { "epoch": 0.3, "grad_norm": 0.24236658215522766, "learning_rate": 0.00014583333333333335, "loss": 4.876, "step": 350 }, { "epoch": 0.31, "grad_norm": 0.23320983350276947, "learning_rate": 0.00015, "loss": 4.8607, "step": 360 }, { "epoch": 0.31, "eval_accuracy": 0.2620490039433166, "eval_loss": 4.841182231903076, "eval_runtime": 1079.5667, "eval_samples_per_second": 462.569, "eval_steps_per_second": 2.409, "step": 360 }, { "epoch": 0.31, "grad_norm": 0.23191139101982117, "learning_rate": 0.00015416666666666668, "loss": 4.846, "step": 370 }, { "epoch": 0.32, "grad_norm": 0.2323000133037567, "learning_rate": 0.00015833333333333332, "loss": 4.8304, "step": 380 }, { "epoch": 0.33, "grad_norm": 0.2213001251220703, "learning_rate": 0.00016250000000000002, "loss": 4.816, "step": 390 }, { "epoch": 0.34, "grad_norm": 0.21700094640254974, "learning_rate": 0.00016666666666666666, "loss": 4.8044, "step": 400 }, { "epoch": 0.35, "grad_norm": 0.27367648482322693, "learning_rate": 0.00017083333333333333, "loss": 4.787, "step": 410 }, { "epoch": 0.36, "grad_norm": 0.3331514000892639, "learning_rate": 0.000175, "loss": 4.775, "step": 420 }, { "epoch": 0.37, "grad_norm": 0.3531811833381653, "learning_rate": 0.00017916666666666667, "loss": 4.7599, "step": 430 }, { "epoch": 0.37, "grad_norm": 0.17425844073295593, "learning_rate": 0.00018333333333333334, "loss": 4.7471, "step": 440 }, { "epoch": 0.38, "grad_norm": 0.16081774234771729, "learning_rate": 0.0001875, "loss": 4.732, "step": 450 }, { "epoch": 0.38, "eval_accuracy": 0.2854636107287403, "eval_loss": 4.713276386260986, "eval_runtime": 1135.1993, "eval_samples_per_second": 439.9, "eval_steps_per_second": 2.291, "step": 450 }, { "epoch": 0.39, "grad_norm": 0.5644450783729553, "learning_rate": 0.00019166666666666667, "loss": 4.7196, "step": 460 }, { "epoch": 0.4, "grad_norm": 0.4822804629802704, "learning_rate": 0.00019583333333333334, "loss": 4.7067, "step": 470 }, { "epoch": 0.41, "grad_norm": 0.26140686869621277, "learning_rate": 0.0002, "loss": 4.6939, "step": 480 }, { "epoch": 0.42, "grad_norm": 0.24341146647930145, "learning_rate": 0.00020416666666666668, "loss": 4.6797, "step": 490 }, { "epoch": 0.42, "grad_norm": 0.1883888840675354, "learning_rate": 0.00020833333333333335, "loss": 4.667, "step": 500 }, { "epoch": 0.43, "grad_norm": 0.32793405652046204, "learning_rate": 0.0002125, "loss": 4.6568, "step": 510 }, { "epoch": 0.44, "grad_norm": 0.6900771856307983, "learning_rate": 0.00021666666666666668, "loss": 4.6504, "step": 520 }, { "epoch": 0.45, "grad_norm": 0.23155897855758667, "learning_rate": 0.00022083333333333333, "loss": 4.6371, "step": 530 }, { "epoch": 0.46, "grad_norm": 0.15708310902118683, "learning_rate": 0.00022500000000000002, "loss": 4.6273, "step": 540 }, { "epoch": 0.46, "eval_accuracy": 0.2875459051329938, "eval_loss": 4.6114654541015625, "eval_runtime": 1075.2001, "eval_samples_per_second": 464.447, "eval_steps_per_second": 2.419, "step": 540 }, { "epoch": 0.47, "grad_norm": 0.14138343930244446, "learning_rate": 0.00022916666666666666, "loss": 4.6187, "step": 550 }, { "epoch": 0.48, "grad_norm": 0.10016636550426483, "learning_rate": 0.00023333333333333333, "loss": 4.6101, "step": 560 }, { "epoch": 0.48, "grad_norm": 0.09886801242828369, "learning_rate": 0.0002375, "loss": 4.6027, "step": 570 }, { "epoch": 0.49, "grad_norm": 0.9051061272621155, "learning_rate": 0.00024166666666666667, "loss": 4.5993, "step": 580 }, { "epoch": 0.5, "grad_norm": 0.8797232508659363, "learning_rate": 0.0002458333333333333, "loss": 4.5982, "step": 590 }, { "epoch": 0.51, "grad_norm": 0.4441538453102112, "learning_rate": 0.00025, "loss": 4.5868, "step": 600 }, { "epoch": 0.52, "grad_norm": 0.24725468456745148, "learning_rate": 0.00025416666666666665, "loss": 4.5836, "step": 610 }, { "epoch": 0.53, "grad_norm": 0.08581159263849258, "learning_rate": 0.00025833333333333334, "loss": 4.5773, "step": 620 }, { "epoch": 0.54, "grad_norm": 0.10642833262681961, "learning_rate": 0.00026250000000000004, "loss": 4.572, "step": 630 }, { "epoch": 0.54, "eval_accuracy": 0.28878817310094446, "eval_loss": 4.563485622406006, "eval_runtime": 1076.0022, "eval_samples_per_second": 464.101, "eval_steps_per_second": 2.417, "step": 630 }, { "epoch": 0.54, "grad_norm": 0.17463397979736328, "learning_rate": 0.0002666666666666667, "loss": 4.5707, "step": 640 }, { "epoch": 0.55, "grad_norm": 0.1806878000497818, "learning_rate": 0.0002708333333333333, "loss": 4.5681, "step": 650 }, { "epoch": 0.56, "grad_norm": 0.18553458154201508, "learning_rate": 0.000275, "loss": 4.5612, "step": 660 }, { "epoch": 0.57, "grad_norm": 0.7338638305664062, "learning_rate": 0.00027916666666666666, "loss": 4.5601, "step": 670 }, { "epoch": 0.58, "grad_norm": 0.3852124512195587, "learning_rate": 0.00028333333333333335, "loss": 4.5567, "step": 680 }, { "epoch": 0.59, "grad_norm": 0.2558722198009491, "learning_rate": 0.0002875, "loss": 4.555, "step": 690 }, { "epoch": 0.59, "grad_norm": 0.1996091902256012, "learning_rate": 0.0002916666666666667, "loss": 4.553, "step": 700 }, { "epoch": 0.6, "grad_norm": 0.414126992225647, "learning_rate": 0.00029583333333333333, "loss": 4.551, "step": 710 }, { "epoch": 0.61, "grad_norm": 0.6828728914260864, "learning_rate": 0.0003, "loss": 4.5485, "step": 720 }, { "epoch": 0.61, "eval_accuracy": 0.2897431184302517, "eval_loss": 4.544471263885498, "eval_runtime": 1075.524, "eval_samples_per_second": 464.308, "eval_steps_per_second": 2.418, "step": 720 }, { "epoch": 0.62, "grad_norm": 0.31978073716163635, "learning_rate": 0.00030416666666666667, "loss": 4.5478, "step": 730 }, { "epoch": 0.63, "grad_norm": 0.23724275827407837, "learning_rate": 0.00030833333333333337, "loss": 4.5451, "step": 740 }, { "epoch": 0.64, "grad_norm": 1.0507104396820068, "learning_rate": 0.0003125, "loss": 4.5441, "step": 750 }, { "epoch": 0.65, "grad_norm": 0.38554638624191284, "learning_rate": 0.00031666666666666665, "loss": 4.5433, "step": 760 }, { "epoch": 0.65, "grad_norm": 0.275704562664032, "learning_rate": 0.00032083333333333334, "loss": 4.5415, "step": 770 }, { "epoch": 0.66, "grad_norm": 0.2305123656988144, "learning_rate": 0.00032500000000000004, "loss": 4.5398, "step": 780 }, { "epoch": 0.67, "grad_norm": 0.4813285768032074, "learning_rate": 0.0003291666666666667, "loss": 4.5391, "step": 790 }, { "epoch": 0.68, "grad_norm": 0.6520434617996216, "learning_rate": 0.0003333333333333333, "loss": 4.5361, "step": 800 }, { "epoch": 0.69, "grad_norm": 0.3875904381275177, "learning_rate": 0.0003375, "loss": 4.5351, "step": 810 }, { "epoch": 0.69, "eval_accuracy": 0.29011102141892453, "eval_loss": 4.53138542175293, "eval_runtime": 1074.6899, "eval_samples_per_second": 464.668, "eval_steps_per_second": 2.42, "step": 810 }, { "epoch": 0.7, "grad_norm": 0.6874526143074036, "learning_rate": 0.00034166666666666666, "loss": 4.5338, "step": 820 }, { "epoch": 0.71, "grad_norm": 0.4180966019630432, "learning_rate": 0.00034583333333333335, "loss": 4.5331, "step": 830 }, { "epoch": 0.71, "grad_norm": 0.41761451959609985, "learning_rate": 0.00035, "loss": 4.5291, "step": 840 }, { "epoch": 0.72, "grad_norm": 0.6088799834251404, "learning_rate": 0.0003541666666666667, "loss": 4.5303, "step": 850 }, { "epoch": 0.73, "grad_norm": 0.20438095927238464, "learning_rate": 0.00035833333333333333, "loss": 4.5298, "step": 860 }, { "epoch": 0.74, "grad_norm": 0.4336546063423157, "learning_rate": 0.0003625, "loss": 4.5283, "step": 870 }, { "epoch": 0.75, "grad_norm": 0.40454909205436707, "learning_rate": 0.00036666666666666667, "loss": 4.5301, "step": 880 }, { "epoch": 0.76, "grad_norm": 0.4893989562988281, "learning_rate": 0.00037083333333333337, "loss": 4.5286, "step": 890 }, { "epoch": 0.76, "grad_norm": 0.4546484053134918, "learning_rate": 0.000375, "loss": 4.5263, "step": 900 }, { "epoch": 0.76, "eval_accuracy": 0.2903379261149848, "eval_loss": 4.5242390632629395, "eval_runtime": 1074.5238, "eval_samples_per_second": 464.74, "eval_steps_per_second": 2.421, "step": 900 }, { "epoch": 0.77, "grad_norm": 0.22772909700870514, "learning_rate": 0.00037916666666666665, "loss": 4.5251, "step": 910 }, { "epoch": 0.78, "grad_norm": 0.48488083481788635, "learning_rate": 0.00038333333333333334, "loss": 4.524, "step": 920 }, { "epoch": 0.79, "grad_norm": 0.21915870904922485, "learning_rate": 0.00038750000000000004, "loss": 4.5276, "step": 930 }, { "epoch": 0.8, "grad_norm": 0.4013586938381195, "learning_rate": 0.0003916666666666667, "loss": 4.5263, "step": 940 }, { "epoch": 0.81, "grad_norm": 0.6083785891532898, "learning_rate": 0.0003958333333333333, "loss": 4.5226, "step": 950 }, { "epoch": 0.82, "grad_norm": 0.5477403402328491, "learning_rate": 0.0004, "loss": 4.5223, "step": 960 }, { "epoch": 0.82, "grad_norm": 0.25517192482948303, "learning_rate": 0.00040416666666666666, "loss": 4.5213, "step": 970 }, { "epoch": 0.83, "grad_norm": 0.5681092739105225, "learning_rate": 0.00040833333333333336, "loss": 4.5213, "step": 980 }, { "epoch": 0.84, "grad_norm": 0.27979689836502075, "learning_rate": 0.0004125, "loss": 4.5223, "step": 990 }, { "epoch": 0.84, "eval_accuracy": 0.2904179370099399, "eval_loss": 4.5171709060668945, "eval_runtime": 1074.3659, "eval_samples_per_second": 464.808, "eval_steps_per_second": 2.421, "step": 990 }, { "epoch": 0.85, "grad_norm": 0.278210312128067, "learning_rate": 0.0004166666666666667, "loss": 4.5185, "step": 1000 }, { "epoch": 0.86, "grad_norm": 0.31496554613113403, "learning_rate": 0.00042083333333333333, "loss": 4.5184, "step": 1010 }, { "epoch": 0.87, "grad_norm": 0.28795501589775085, "learning_rate": 0.000425, "loss": 4.5202, "step": 1020 }, { "epoch": 0.88, "grad_norm": 0.1585451066493988, "learning_rate": 0.00042916666666666667, "loss": 4.5177, "step": 1030 }, { "epoch": 0.88, "grad_norm": 0.25573596358299255, "learning_rate": 0.00043333333333333337, "loss": 4.5157, "step": 1040 }, { "epoch": 0.89, "grad_norm": 0.9261253476142883, "learning_rate": 0.0004375, "loss": 4.5155, "step": 1050 }, { "epoch": 0.9, "grad_norm": 0.20488545298576355, "learning_rate": 0.00044166666666666665, "loss": 4.5171, "step": 1060 }, { "epoch": 0.91, "grad_norm": 0.19982470571994781, "learning_rate": 0.00044583333333333335, "loss": 4.5157, "step": 1070 }, { "epoch": 0.92, "grad_norm": 0.2195570170879364, "learning_rate": 0.00045000000000000004, "loss": 4.511, "step": 1080 }, { "epoch": 0.92, "eval_accuracy": 0.29042484814802466, "eval_loss": 4.511170387268066, "eval_runtime": 1074.2406, "eval_samples_per_second": 464.862, "eval_steps_per_second": 2.421, "step": 1080 }, { "epoch": 0.93, "grad_norm": 0.5774135589599609, "learning_rate": 0.0004541666666666667, "loss": 4.5159, "step": 1090 }, { "epoch": 0.93, "grad_norm": 0.3652968108654022, "learning_rate": 0.0004583333333333333, "loss": 4.5152, "step": 1100 }, { "epoch": 0.94, "grad_norm": 0.4962700307369232, "learning_rate": 0.0004625, "loss": 4.5139, "step": 1110 }, { "epoch": 0.95, "grad_norm": 0.3226447105407715, "learning_rate": 0.00046666666666666666, "loss": 4.5117, "step": 1120 }, { "epoch": 0.96, "grad_norm": 0.5751166939735413, "learning_rate": 0.00047083333333333336, "loss": 4.5131, "step": 1130 }, { "epoch": 0.97, "grad_norm": 0.181748166680336, "learning_rate": 0.000475, "loss": 4.5136, "step": 1140 }, { "epoch": 0.98, "grad_norm": 0.6175718903541565, "learning_rate": 0.0004791666666666667, "loss": 4.5107, "step": 1150 }, { "epoch": 0.99, "grad_norm": 0.261405348777771, "learning_rate": 0.00048333333333333334, "loss": 4.5124, "step": 1160 }, { "epoch": 0.99, "grad_norm": 0.18674370646476746, "learning_rate": 0.0004875, "loss": 4.5093, "step": 1170 }, { "epoch": 0.99, "eval_accuracy": 0.2903912366894582, "eval_loss": 4.507014274597168, "eval_runtime": 1075.0378, "eval_samples_per_second": 464.518, "eval_steps_per_second": 2.419, "step": 1170 }, { "epoch": 1.0, "grad_norm": 0.4493379294872284, "learning_rate": 0.0004916666666666666, "loss": 4.5107, "step": 1180 }, { "epoch": 1.01, "grad_norm": 0.40603315830230713, "learning_rate": 0.0004958333333333334, "loss": 4.5103, "step": 1190 }, { "epoch": 1.02, "grad_norm": 0.17475590109825134, "learning_rate": 0.0005, "loss": 4.507, "step": 1200 }, { "epoch": 1.03, "grad_norm": 0.6670963168144226, "learning_rate": 0.0005041666666666667, "loss": 4.5076, "step": 1210 }, { "epoch": 1.04, "grad_norm": 0.20023925602436066, "learning_rate": 0.0005083333333333333, "loss": 4.5092, "step": 1220 }, { "epoch": 1.05, "grad_norm": 0.38801464438438416, "learning_rate": 0.0005124999999999999, "loss": 4.5047, "step": 1230 }, { "epoch": 1.05, "grad_norm": 0.4637294113636017, "learning_rate": 0.0005166666666666667, "loss": 4.5088, "step": 1240 }, { "epoch": 1.06, "grad_norm": 0.17603175342082977, "learning_rate": 0.0005208333333333334, "loss": 4.5057, "step": 1250 }, { "epoch": 1.07, "grad_norm": 0.4553210735321045, "learning_rate": 0.0005250000000000001, "loss": 4.505, "step": 1260 }, { "epoch": 1.07, "eval_accuracy": 0.29054962247372956, "eval_loss": 4.502260208129883, "eval_runtime": 1075.7242, "eval_samples_per_second": 464.221, "eval_steps_per_second": 2.418, "step": 1260 }, { "epoch": 1.08, "grad_norm": 0.30805012583732605, "learning_rate": 0.0005291666666666667, "loss": 4.5032, "step": 1270 }, { "epoch": 1.09, "grad_norm": 0.5969117283821106, "learning_rate": 0.0005333333333333334, "loss": 4.5036, "step": 1280 }, { "epoch": 1.1, "grad_norm": 0.24606676399707794, "learning_rate": 0.0005375, "loss": 4.5042, "step": 1290 }, { "epoch": 1.1, "grad_norm": 0.2750067710876465, "learning_rate": 0.0005416666666666666, "loss": 4.5037, "step": 1300 }, { "epoch": 1.11, "grad_norm": 0.4421214163303375, "learning_rate": 0.0005458333333333333, "loss": 4.5014, "step": 1310 }, { "epoch": 1.12, "grad_norm": 0.2441830188035965, "learning_rate": 0.00055, "loss": 4.5005, "step": 1320 }, { "epoch": 1.13, "grad_norm": 0.25598272681236267, "learning_rate": 0.0005541666666666667, "loss": 4.5007, "step": 1330 }, { "epoch": 1.14, "grad_norm": 0.17499062418937683, "learning_rate": 0.0005583333333333333, "loss": 4.5031, "step": 1340 }, { "epoch": 1.15, "grad_norm": 0.6325914263725281, "learning_rate": 0.0005625000000000001, "loss": 4.5003, "step": 1350 }, { "epoch": 1.15, "eval_accuracy": 0.2903214025778754, "eval_loss": 4.499546051025391, "eval_runtime": 1074.5865, "eval_samples_per_second": 464.713, "eval_steps_per_second": 2.42, "step": 1350 }, { "epoch": 1.16, "grad_norm": 0.29858532547950745, "learning_rate": 0.0005666666666666667, "loss": 4.5014, "step": 1360 }, { "epoch": 1.16, "grad_norm": 0.3625228703022003, "learning_rate": 0.0005708333333333333, "loss": 4.4991, "step": 1370 }, { "epoch": 1.17, "grad_norm": 0.271508127450943, "learning_rate": 0.000575, "loss": 4.499, "step": 1380 }, { "epoch": 1.18, "grad_norm": 0.7316662073135376, "learning_rate": 0.0005791666666666667, "loss": 4.498, "step": 1390 }, { "epoch": 1.19, "grad_norm": 0.3072379529476166, "learning_rate": 0.0005833333333333334, "loss": 4.5021, "step": 1400 }, { "epoch": 1.2, "grad_norm": 0.16316668689250946, "learning_rate": 0.0005875, "loss": 4.4963, "step": 1410 }, { "epoch": 1.21, "grad_norm": 0.6301301717758179, "learning_rate": 0.0005916666666666667, "loss": 4.4978, "step": 1420 }, { "epoch": 1.22, "grad_norm": 0.22797346115112305, "learning_rate": 0.0005958333333333333, "loss": 4.4964, "step": 1430 }, { "epoch": 1.22, "grad_norm": 0.3899094760417938, "learning_rate": 0.0006, "loss": 4.4939, "step": 1440 }, { "epoch": 1.22, "eval_accuracy": 0.29044920062027546, "eval_loss": 4.489974021911621, "eval_runtime": 1075.4503, "eval_samples_per_second": 464.339, "eval_steps_per_second": 2.419, "step": 1440 }, { "epoch": 1.23, "grad_norm": 0.34080126881599426, "learning_rate": 0.0006041666666666666, "loss": 4.493, "step": 1450 }, { "epoch": 1.24, "grad_norm": 1.12690007686615, "learning_rate": 0.0006083333333333333, "loss": 4.4904, "step": 1460 }, { "epoch": 1.25, "grad_norm": 0.30394747853279114, "learning_rate": 0.0006125000000000001, "loss": 4.489, "step": 1470 }, { "epoch": 1.26, "grad_norm": 0.4542248845100403, "learning_rate": 0.0006166666666666667, "loss": 4.4841, "step": 1480 }, { "epoch": 1.27, "grad_norm": 0.3733484447002411, "learning_rate": 0.0006208333333333334, "loss": 4.4738, "step": 1490 }, { "epoch": 1.27, "grad_norm": 0.31465083360671997, "learning_rate": 0.000625, "loss": 4.4695, "step": 1500 }, { "epoch": 1.28, "grad_norm": 0.32257241010665894, "learning_rate": 0.0006291666666666667, "loss": 4.461, "step": 1510 }, { "epoch": 1.29, "grad_norm": 0.7750576734542847, "learning_rate": 0.0006333333333333333, "loss": 4.4636, "step": 1520 }, { "epoch": 1.3, "grad_norm": 0.35094720125198364, "learning_rate": 0.0006374999999999999, "loss": 4.4569, "step": 1530 }, { "epoch": 1.3, "eval_accuracy": 0.2906712538997569, "eval_loss": 4.445650100708008, "eval_runtime": 1075.3046, "eval_samples_per_second": 464.402, "eval_steps_per_second": 2.419, "step": 1530 }, { "epoch": 1.31, "grad_norm": 0.5662222504615784, "learning_rate": 0.0006416666666666667, "loss": 4.4542, "step": 1540 }, { "epoch": 1.32, "grad_norm": 0.6326726675033569, "learning_rate": 0.0006458333333333334, "loss": 4.4509, "step": 1550 }, { "epoch": 1.33, "grad_norm": 0.4773523807525635, "learning_rate": 0.0006500000000000001, "loss": 4.4468, "step": 1560 }, { "epoch": 1.33, "grad_norm": 0.4514019191265106, "learning_rate": 0.0006541666666666667, "loss": 4.442, "step": 1570 }, { "epoch": 1.34, "grad_norm": 0.5631856918334961, "learning_rate": 0.0006583333333333334, "loss": 4.44, "step": 1580 }, { "epoch": 1.35, "grad_norm": 0.36760690808296204, "learning_rate": 0.0006625, "loss": 4.4317, "step": 1590 }, { "epoch": 1.36, "grad_norm": 0.669217586517334, "learning_rate": 0.0006666666666666666, "loss": 4.4365, "step": 1600 }, { "epoch": 1.37, "grad_norm": 0.4648076891899109, "learning_rate": 0.0006708333333333333, "loss": 4.4277, "step": 1610 }, { "epoch": 1.38, "grad_norm": 0.45093855261802673, "learning_rate": 0.000675, "loss": 4.4215, "step": 1620 }, { "epoch": 1.38, "eval_accuracy": 0.29071258845839876, "eval_loss": 4.40159797668457, "eval_runtime": 1073.4339, "eval_samples_per_second": 465.212, "eval_steps_per_second": 2.423, "step": 1620 }, { "epoch": 1.39, "grad_norm": 0.6909874081611633, "learning_rate": 0.0006791666666666667, "loss": 4.4243, "step": 1630 }, { "epoch": 1.39, "grad_norm": 0.5092808604240417, "learning_rate": 0.0006833333333333333, "loss": 4.4162, "step": 1640 }, { "epoch": 1.4, "grad_norm": 0.7031335234642029, "learning_rate": 0.0006875, "loss": 4.409, "step": 1650 }, { "epoch": 1.41, "grad_norm": 0.778090238571167, "learning_rate": 0.0006916666666666667, "loss": 4.4091, "step": 1660 }, { "epoch": 1.42, "grad_norm": 0.935316801071167, "learning_rate": 0.0006958333333333334, "loss": 4.4059, "step": 1670 }, { "epoch": 1.43, "grad_norm": 0.9648371934890747, "learning_rate": 0.0007, "loss": 4.4033, "step": 1680 }, { "epoch": 1.44, "grad_norm": 0.524691641330719, "learning_rate": 0.0007041666666666667, "loss": 4.4012, "step": 1690 }, { "epoch": 1.44, "grad_norm": 0.5595187544822693, "learning_rate": 0.0007083333333333334, "loss": 4.3954, "step": 1700 }, { "epoch": 1.45, "grad_norm": 0.5809574723243713, "learning_rate": 0.0007125, "loss": 4.3943, "step": 1710 }, { "epoch": 1.45, "eval_accuracy": 0.29116029691107925, "eval_loss": 4.3684492111206055, "eval_runtime": 1080.2368, "eval_samples_per_second": 462.282, "eval_steps_per_second": 2.408, "step": 1710 }, { "epoch": 1.46, "grad_norm": 0.4484635293483734, "learning_rate": 0.0007166666666666667, "loss": 4.3881, "step": 1720 }, { "epoch": 1.47, "grad_norm": 0.6823798418045044, "learning_rate": 0.0007208333333333333, "loss": 4.3829, "step": 1730 }, { "epoch": 1.48, "grad_norm": 0.7428690791130066, "learning_rate": 0.000725, "loss": 4.3843, "step": 1740 }, { "epoch": 1.49, "grad_norm": 0.6587589979171753, "learning_rate": 0.0007291666666666666, "loss": 4.3828, "step": 1750 }, { "epoch": 1.5, "grad_norm": 0.5860837697982788, "learning_rate": 0.0007333333333333333, "loss": 4.3764, "step": 1760 }, { "epoch": 1.5, "grad_norm": 0.5413070321083069, "learning_rate": 0.0007375000000000001, "loss": 4.3752, "step": 1770 }, { "epoch": 1.51, "grad_norm": 0.665489137172699, "learning_rate": 0.0007416666666666667, "loss": 4.3728, "step": 1780 }, { "epoch": 1.52, "grad_norm": 0.711599588394165, "learning_rate": 0.0007458333333333334, "loss": 4.373, "step": 1790 }, { "epoch": 1.53, "grad_norm": 0.6164100170135498, "learning_rate": 0.00075, "loss": 4.3677, "step": 1800 }, { "epoch": 1.53, "eval_accuracy": 0.29116748362162354, "eval_loss": 4.338656902313232, "eval_runtime": 1078.2447, "eval_samples_per_second": 463.136, "eval_steps_per_second": 2.412, "step": 1800 }, { "epoch": 1.54, "grad_norm": 0.8174536228179932, "learning_rate": 0.0007541666666666667, "loss": 4.3622, "step": 1810 }, { "epoch": 1.55, "grad_norm": 0.4686708152294159, "learning_rate": 0.0007583333333333333, "loss": 4.3615, "step": 1820 }, { "epoch": 1.56, "grad_norm": 0.726311206817627, "learning_rate": 0.0007624999999999999, "loss": 4.3553, "step": 1830 }, { "epoch": 1.56, "grad_norm": 0.6094339489936829, "learning_rate": 0.0007666666666666667, "loss": 4.3584, "step": 1840 }, { "epoch": 1.57, "grad_norm": 0.43092453479766846, "learning_rate": 0.0007708333333333334, "loss": 4.3515, "step": 1850 }, { "epoch": 1.58, "grad_norm": 0.8314465284347534, "learning_rate": 0.0007750000000000001, "loss": 4.3503, "step": 1860 }, { "epoch": 1.59, "grad_norm": 0.9535554647445679, "learning_rate": 0.0007791666666666667, "loss": 4.3459, "step": 1870 }, { "epoch": 1.6, "grad_norm": 0.9487770795822144, "learning_rate": 0.0007833333333333334, "loss": 4.3438, "step": 1880 }, { "epoch": 1.61, "grad_norm": 0.65323406457901, "learning_rate": 0.0007875, "loss": 4.3382, "step": 1890 }, { "epoch": 1.61, "eval_accuracy": 0.2915768463978657, "eval_loss": 4.299588203430176, "eval_runtime": 1078.9106, "eval_samples_per_second": 462.85, "eval_steps_per_second": 2.411, "step": 1890 }, { "epoch": 1.61, "grad_norm": 0.8234522342681885, "learning_rate": 0.0007916666666666666, "loss": 4.337, "step": 1900 }, { "epoch": 1.62, "grad_norm": 0.984524667263031, "learning_rate": 0.0007958333333333333, "loss": 4.3342, "step": 1910 }, { "epoch": 1.63, "grad_norm": 1.087571382522583, "learning_rate": 0.0008, "loss": 4.3327, "step": 1920 }, { "epoch": 1.64, "grad_norm": 0.8180701732635498, "learning_rate": 0.0008041666666666667, "loss": 4.3292, "step": 1930 }, { "epoch": 1.65, "grad_norm": 1.209524154663086, "learning_rate": 0.0008083333333333333, "loss": 4.3258, "step": 1940 }, { "epoch": 1.66, "grad_norm": 1.2440215349197388, "learning_rate": 0.0008125000000000001, "loss": 4.322, "step": 1950 }, { "epoch": 1.67, "grad_norm": 1.1065999269485474, "learning_rate": 0.0008166666666666667, "loss": 4.3207, "step": 1960 }, { "epoch": 1.67, "grad_norm": 2.2142093181610107, "learning_rate": 0.0008208333333333334, "loss": 4.3183, "step": 1970 }, { "epoch": 1.68, "grad_norm": 0.8045121431350708, "learning_rate": 0.000825, "loss": 4.3216, "step": 1980 }, { "epoch": 1.68, "eval_accuracy": 0.2919771311011085, "eval_loss": 4.267116069793701, "eval_runtime": 1079.6166, "eval_samples_per_second": 462.548, "eval_steps_per_second": 2.409, "step": 1980 }, { "epoch": 1.45, "grad_norm": 0.8329472541809082, "learning_rate": 0.0008291666666666667, "loss": 4.3102, "step": 1990 }, { "epoch": 1.46, "grad_norm": 1.114058256149292, "learning_rate": 0.0008333333333333334, "loss": 4.3138, "step": 2000 }, { "epoch": 1.46, "grad_norm": 0.989930272102356, "learning_rate": 0.0008375, "loss": 4.3068, "step": 2010 }, { "epoch": 1.47, "grad_norm": 0.9290440082550049, "learning_rate": 0.0008416666666666667, "loss": 4.3016, "step": 2020 }, { "epoch": 1.48, "grad_norm": 1.6841094493865967, "learning_rate": 0.0008458333333333333, "loss": 4.3031, "step": 2030 }, { "epoch": 1.49, "grad_norm": 1.0473177433013916, "learning_rate": 0.00085, "loss": 4.3079, "step": 2040 }, { "epoch": 1.49, "grad_norm": 1.1396197080612183, "learning_rate": 0.0008541666666666666, "loss": 4.2986, "step": 2050 }, { "epoch": 1.5, "grad_norm": 1.2318438291549683, "learning_rate": 0.0008583333333333333, "loss": 4.3005, "step": 2060 }, { "epoch": 1.51, "grad_norm": 1.749923825263977, "learning_rate": 0.0008625000000000001, "loss": 4.2879, "step": 2070 }, { "epoch": 1.51, "eval_accuracy": 0.2931822363307358, "eval_loss": 4.231507778167725, "eval_runtime": 1089.4507, "eval_samples_per_second": 458.378, "eval_steps_per_second": 2.047, "step": 2070 }, { "epoch": 1.52, "grad_norm": 0.9980252385139465, "learning_rate": 0.0008666666666666667, "loss": 4.2821, "step": 2080 }, { "epoch": 1.52, "grad_norm": 1.5635493993759155, "learning_rate": 0.0008708333333333334, "loss": 4.29, "step": 2090 }, { "epoch": 1.53, "grad_norm": 1.6463395357131958, "learning_rate": 0.000875, "loss": 4.2856, "step": 2100 }, { "epoch": 1.54, "grad_norm": 1.3602315187454224, "learning_rate": 0.0008791666666666667, "loss": 4.2802, "step": 2110 }, { "epoch": 1.54, "grad_norm": 0.8902882933616638, "learning_rate": 0.0008833333333333333, "loss": 4.2729, "step": 2120 }, { "epoch": 1.55, "grad_norm": 1.186219573020935, "learning_rate": 0.0008874999999999999, "loss": 4.2691, "step": 2130 }, { "epoch": 1.56, "grad_norm": 1.0059683322906494, "learning_rate": 0.0008916666666666667, "loss": 4.26, "step": 2140 }, { "epoch": 1.57, "grad_norm": 1.2011739015579224, "learning_rate": 0.0008958333333333334, "loss": 4.2577, "step": 2150 }, { "epoch": 1.57, "grad_norm": 1.4442743062973022, "learning_rate": 0.0009000000000000001, "loss": 4.263, "step": 2160 }, { "epoch": 1.57, "eval_accuracy": 0.2934964665435206, "eval_loss": 4.21316385269165, "eval_runtime": 1096.5373, "eval_samples_per_second": 455.415, "eval_steps_per_second": 2.034, "step": 2160 }, { "epoch": 1.58, "grad_norm": 1.1617356538772583, "learning_rate": 0.0009041666666666667, "loss": 4.2661, "step": 2170 }, { "epoch": 1.59, "grad_norm": 1.3990079164505005, "learning_rate": 0.0009083333333333334, "loss": 4.2475, "step": 2180 }, { "epoch": 1.6, "grad_norm": 1.2959562540054321, "learning_rate": 0.0009125, "loss": 4.2415, "step": 2190 }, { "epoch": 1.6, "grad_norm": 1.0796222686767578, "learning_rate": 0.0009166666666666666, "loss": 4.2337, "step": 2200 }, { "epoch": 1.61, "grad_norm": 1.6943458318710327, "learning_rate": 0.0009208333333333333, "loss": 4.2281, "step": 2210 }, { "epoch": 1.62, "grad_norm": 1.7960783243179321, "learning_rate": 0.000925, "loss": 4.2241, "step": 2220 }, { "epoch": 1.63, "grad_norm": 2.086534023284912, "learning_rate": 0.0009291666666666667, "loss": 4.2269, "step": 2230 }, { "epoch": 1.63, "grad_norm": 1.137702226638794, "learning_rate": 0.0009333333333333333, "loss": 4.2158, "step": 2240 }, { "epoch": 1.64, "grad_norm": 1.1577701568603516, "learning_rate": 0.0009375, "loss": 4.2013, "step": 2250 }, { "epoch": 1.64, "eval_accuracy": 0.2987269750298371, "eval_loss": 4.123126029968262, "eval_runtime": 1097.6256, "eval_samples_per_second": 454.964, "eval_steps_per_second": 2.032, "step": 2250 }, { "epoch": 1.65, "grad_norm": 1.6455570459365845, "learning_rate": 0.0009416666666666667, "loss": 4.1815, "step": 2260 }, { "epoch": 1.65, "grad_norm": 1.6025768518447876, "learning_rate": 0.0009458333333333334, "loss": 4.1534, "step": 2270 }, { "epoch": 1.66, "grad_norm": 1.2223172187805176, "learning_rate": 0.00095, "loss": 4.1298, "step": 2280 }, { "epoch": 1.67, "grad_norm": 1.766542673110962, "learning_rate": 0.0009541666666666667, "loss": 4.1187, "step": 2290 }, { "epoch": 1.68, "grad_norm": 2.156003952026367, "learning_rate": 0.0009583333333333334, "loss": 4.0858, "step": 2300 }, { "epoch": 1.68, "grad_norm": 1.9074057340621948, "learning_rate": 0.0009625, "loss": 4.0801, "step": 2310 }, { "epoch": 1.69, "grad_norm": 1.6140304803848267, "learning_rate": 0.0009666666666666667, "loss": 4.0383, "step": 2320 }, { "epoch": 1.7, "grad_norm": 1.5922300815582275, "learning_rate": 0.0009708333333333333, "loss": 4.0099, "step": 2330 }, { "epoch": 1.71, "grad_norm": 1.9714833498001099, "learning_rate": 0.000975, "loss": 3.9757, "step": 2340 }, { "epoch": 1.71, "eval_accuracy": 0.3303083702251303, "eval_loss": 3.764934539794922, "eval_runtime": 1104.6754, "eval_samples_per_second": 452.06, "eval_steps_per_second": 2.019, "step": 2340 }, { "epoch": 1.71, "grad_norm": 2.1198415756225586, "learning_rate": 0.0009791666666666666, "loss": 3.9507, "step": 2350 }, { "epoch": 1.72, "grad_norm": 2.0731935501098633, "learning_rate": 0.0009833333333333332, "loss": 3.9258, "step": 2360 }, { "epoch": 1.73, "grad_norm": 2.1984808444976807, "learning_rate": 0.0009875, "loss": 3.9003, "step": 2370 }, { "epoch": 1.73, "grad_norm": 2.033250331878662, "learning_rate": 0.0009916666666666667, "loss": 3.8732, "step": 2380 }, { "epoch": 1.74, "grad_norm": 1.7183982133865356, "learning_rate": 0.0009958333333333334, "loss": 3.8557, "step": 2390 }, { "epoch": 1.75, "grad_norm": 2.216938018798828, "learning_rate": 0.001, "loss": 3.8376, "step": 2400 }, { "epoch": 1.76, "grad_norm": 2.109079599380493, "learning_rate": 0.000999009900990099, "loss": 3.8212, "step": 2410 }, { "epoch": 1.76, "grad_norm": 1.5402984619140625, "learning_rate": 0.0009980198019801981, "loss": 3.8, "step": 2420 }, { "epoch": 1.77, "grad_norm": 2.051513433456421, "learning_rate": 0.000997029702970297, "loss": 3.7913, "step": 2430 }, { "epoch": 1.77, "eval_accuracy": 0.35789052045361985, "eval_loss": 3.5296359062194824, "eval_runtime": 1087.9351, "eval_samples_per_second": 459.016, "eval_steps_per_second": 2.05, "step": 2430 }, { "epoch": 1.78, "grad_norm": 1.8306666612625122, "learning_rate": 0.000996039603960396, "loss": 3.7567, "step": 2440 }, { "epoch": 1.79, "grad_norm": 1.9114989042282104, "learning_rate": 0.000995049504950495, "loss": 3.7491, "step": 2450 }, { "epoch": 1.79, "grad_norm": 1.9881885051727295, "learning_rate": 0.0009940594059405941, "loss": 3.7297, "step": 2460 }, { "epoch": 1.8, "grad_norm": 2.2852580547332764, "learning_rate": 0.0009930693069306932, "loss": 3.7073, "step": 2470 }, { "epoch": 1.81, "grad_norm": 2.090174913406372, "learning_rate": 0.000992079207920792, "loss": 3.6902, "step": 2480 }, { "epoch": 1.81, "grad_norm": 2.5586419105529785, "learning_rate": 0.000991089108910891, "loss": 3.6792, "step": 2490 }, { "epoch": 1.82, "grad_norm": 1.9420301914215088, "learning_rate": 0.0009900990099009901, "loss": 3.6728, "step": 2500 }, { "epoch": 1.83, "grad_norm": 2.319821834564209, "learning_rate": 0.0009891089108910892, "loss": 3.6627, "step": 2510 }, { "epoch": 1.84, "grad_norm": 2.134413480758667, "learning_rate": 0.0009881188118811882, "loss": 3.6435, "step": 2520 }, { "epoch": 1.84, "eval_accuracy": 0.3799301143797497, "eval_loss": 3.3790884017944336, "eval_runtime": 1089.5448, "eval_samples_per_second": 458.338, "eval_steps_per_second": 2.047, "step": 2520 }, { "epoch": 1.84, "grad_norm": 1.8554224967956543, "learning_rate": 0.000987128712871287, "loss": 3.6265, "step": 2530 }, { "epoch": 1.85, "grad_norm": 2.16987681388855, "learning_rate": 0.000986138613861386, "loss": 3.6098, "step": 2540 }, { "epoch": 1.86, "grad_norm": 1.9863182306289673, "learning_rate": 0.0009851485148514852, "loss": 3.5982, "step": 2550 }, { "epoch": 1.87, "grad_norm": 2.0247480869293213, "learning_rate": 0.0009841584158415842, "loss": 3.5911, "step": 2560 }, { "epoch": 1.87, "grad_norm": 2.2719273567199707, "learning_rate": 0.0009831683168316833, "loss": 3.5804, "step": 2570 }, { "epoch": 1.88, "grad_norm": 2.0588369369506836, "learning_rate": 0.000982178217821782, "loss": 3.5654, "step": 2580 }, { "epoch": 1.89, "grad_norm": 1.9666892290115356, "learning_rate": 0.0009811881188118811, "loss": 3.558, "step": 2590 }, { "epoch": 1.89, "grad_norm": 1.752681016921997, "learning_rate": 0.0009801980198019802, "loss": 3.5389, "step": 2600 }, { "epoch": 1.9, "grad_norm": 2.821775197982788, "learning_rate": 0.0009792079207920793, "loss": 3.5327, "step": 2610 }, { "epoch": 1.9, "eval_accuracy": 0.39317171253107736, "eval_loss": 3.2741596698760986, "eval_runtime": 1086.2288, "eval_samples_per_second": 459.737, "eval_steps_per_second": 2.053, "step": 2610 }, { "epoch": 1.91, "grad_norm": 2.4307518005371094, "learning_rate": 0.0009782178217821783, "loss": 3.5311, "step": 2620 }, { "epoch": 1.92, "grad_norm": 1.8416870832443237, "learning_rate": 0.0009772277227722771, "loss": 3.5199, "step": 2630 }, { "epoch": 1.92, "grad_norm": 1.7294279336929321, "learning_rate": 0.0009762376237623762, "loss": 3.5067, "step": 2640 }, { "epoch": 1.93, "grad_norm": 2.0376105308532715, "learning_rate": 0.0009752475247524752, "loss": 3.4957, "step": 2650 }, { "epoch": 1.94, "grad_norm": 1.845569133758545, "learning_rate": 0.0009742574257425743, "loss": 3.4778, "step": 2660 }, { "epoch": 1.95, "grad_norm": 2.1370015144348145, "learning_rate": 0.0009732673267326732, "loss": 3.4766, "step": 2670 }, { "epoch": 1.95, "grad_norm": 2.0046229362487793, "learning_rate": 0.0009722772277227723, "loss": 3.4609, "step": 2680 }, { "epoch": 1.96, "grad_norm": 1.7367238998413086, "learning_rate": 0.0009712871287128712, "loss": 3.4574, "step": 2690 }, { "epoch": 1.97, "grad_norm": 2.245299816131592, "learning_rate": 0.0009702970297029703, "loss": 3.4402, "step": 2700 }, { "epoch": 1.97, "eval_accuracy": 0.40886959953318786, "eval_loss": 3.1605701446533203, "eval_runtime": 1086.3963, "eval_samples_per_second": 459.666, "eval_steps_per_second": 2.053, "step": 2700 }, { "epoch": 1.97, "grad_norm": 1.6792678833007812, "learning_rate": 0.0009693069306930693, "loss": 3.4155, "step": 2710 }, { "epoch": 1.98, "grad_norm": 2.1290223598480225, "learning_rate": 0.0009683168316831683, "loss": 3.3953, "step": 2720 }, { "epoch": 1.99, "grad_norm": 1.9963873624801636, "learning_rate": 0.0009673267326732673, "loss": 3.3722, "step": 2730 }, { "epoch": 2.0, "grad_norm": 2.212454080581665, "learning_rate": 0.0009663366336633663, "loss": 3.3532, "step": 2740 }, { "epoch": 2.0, "grad_norm": 2.145552396774292, "learning_rate": 0.0009653465346534653, "loss": 3.336, "step": 2750 }, { "epoch": 2.01, "grad_norm": 2.423874616622925, "learning_rate": 0.0009643564356435644, "loss": 3.3111, "step": 2760 }, { "epoch": 2.02, "grad_norm": 2.0116701126098633, "learning_rate": 0.0009633663366336633, "loss": 3.305, "step": 2770 }, { "epoch": 2.03, "grad_norm": 2.243619203567505, "learning_rate": 0.0009623762376237624, "loss": 3.2854, "step": 2780 }, { "epoch": 2.03, "grad_norm": 1.5583114624023438, "learning_rate": 0.0009613861386138613, "loss": 3.2635, "step": 2790 }, { "epoch": 2.03, "eval_accuracy": 0.43169227745021366, "eval_loss": 2.9848363399505615, "eval_runtime": 1087.8763, "eval_samples_per_second": 459.041, "eval_steps_per_second": 2.05, "step": 2790 }, { "epoch": 2.04, "grad_norm": 2.156170606613159, "learning_rate": 0.0009603960396039604, "loss": 3.2498, "step": 2800 }, { "epoch": 2.05, "grad_norm": 1.917297601699829, "learning_rate": 0.0009594059405940594, "loss": 3.2343, "step": 2810 }, { "epoch": 2.05, "grad_norm": 1.7647627592086792, "learning_rate": 0.0009584158415841584, "loss": 3.2206, "step": 2820 }, { "epoch": 2.06, "grad_norm": 1.7406831979751587, "learning_rate": 0.0009574257425742574, "loss": 3.2023, "step": 2830 }, { "epoch": 2.07, "grad_norm": 1.721940040588379, "learning_rate": 0.0009564356435643564, "loss": 3.1896, "step": 2840 }, { "epoch": 2.08, "grad_norm": 1.5204572677612305, "learning_rate": 0.0009554455445544554, "loss": 3.1769, "step": 2850 }, { "epoch": 2.08, "grad_norm": 2.20760440826416, "learning_rate": 0.0009544554455445545, "loss": 3.1706, "step": 2860 }, { "epoch": 2.09, "grad_norm": 1.6796480417251587, "learning_rate": 0.0009534653465346534, "loss": 3.1552, "step": 2870 }, { "epoch": 2.1, "grad_norm": 2.044858455657959, "learning_rate": 0.0009524752475247525, "loss": 3.1385, "step": 2880 }, { "epoch": 2.1, "eval_accuracy": 0.4465053493029932, "eval_loss": 2.872570037841797, "eval_runtime": 1089.0874, "eval_samples_per_second": 458.531, "eval_steps_per_second": 2.048, "step": 2880 }, { "epoch": 2.11, "grad_norm": 1.3739113807678223, "learning_rate": 0.0009514851485148514, "loss": 3.1217, "step": 2890 }, { "epoch": 2.11, "grad_norm": 1.631298303604126, "learning_rate": 0.0009504950495049505, "loss": 3.1102, "step": 2900 }, { "epoch": 2.12, "grad_norm": 1.8213354349136353, "learning_rate": 0.0009495049504950495, "loss": 3.092, "step": 2910 }, { "epoch": 2.13, "grad_norm": 1.910646915435791, "learning_rate": 0.0009485148514851485, "loss": 3.0768, "step": 2920 }, { "epoch": 2.14, "grad_norm": 1.6591072082519531, "learning_rate": 0.0009475247524752475, "loss": 3.0721, "step": 2930 }, { "epoch": 2.14, "grad_norm": 1.921587347984314, "learning_rate": 0.0009465346534653465, "loss": 3.0493, "step": 2940 }, { "epoch": 2.15, "grad_norm": 1.576114296913147, "learning_rate": 0.0009455445544554455, "loss": 3.0392, "step": 2950 }, { "epoch": 2.16, "grad_norm": 1.41093909740448, "learning_rate": 0.0009445544554455446, "loss": 3.0204, "step": 2960 }, { "epoch": 2.16, "grad_norm": 1.41178297996521, "learning_rate": 0.0009435643564356435, "loss": 3.0046, "step": 2970 }, { "epoch": 2.16, "eval_accuracy": 0.46085574907280247, "eval_loss": 2.7754335403442383, "eval_runtime": 1088.9809, "eval_samples_per_second": 458.576, "eval_steps_per_second": 2.048, "step": 2970 }, { "epoch": 2.17, "grad_norm": 1.5229026079177856, "learning_rate": 0.0009425742574257426, "loss": 2.989, "step": 2980 }, { "epoch": 2.18, "grad_norm": 1.3193325996398926, "learning_rate": 0.0009415841584158415, "loss": 2.9764, "step": 2990 }, { "epoch": 2.19, "grad_norm": 1.1938610076904297, "learning_rate": 0.0009405940594059406, "loss": 2.9636, "step": 3000 }, { "epoch": 2.19, "grad_norm": 1.1402697563171387, "learning_rate": 0.0009396039603960396, "loss": 2.9517, "step": 3010 }, { "epoch": 2.2, "grad_norm": 1.4980099201202393, "learning_rate": 0.0009386138613861386, "loss": 2.9445, "step": 3020 }, { "epoch": 2.21, "grad_norm": 1.4591041803359985, "learning_rate": 0.0009376237623762376, "loss": 2.9317, "step": 3030 }, { "epoch": 2.22, "grad_norm": 1.4302833080291748, "learning_rate": 0.0009366336633663367, "loss": 2.9167, "step": 3040 }, { "epoch": 2.22, "grad_norm": 1.2571301460266113, "learning_rate": 0.0009356435643564357, "loss": 2.9049, "step": 3050 }, { "epoch": 2.23, "grad_norm": 1.2039096355438232, "learning_rate": 0.0009346534653465348, "loss": 2.8885, "step": 3060 }, { "epoch": 2.23, "eval_accuracy": 0.4740639726753192, "eval_loss": 2.6853535175323486, "eval_runtime": 1088.5789, "eval_samples_per_second": 458.745, "eval_steps_per_second": 2.049, "step": 3060 }, { "epoch": 2.24, "grad_norm": 1.1458439826965332, "learning_rate": 0.0009336633663366337, "loss": 2.8844, "step": 3070 }, { "epoch": 2.24, "grad_norm": 1.1883801221847534, "learning_rate": 0.0009326732673267328, "loss": 2.8777, "step": 3080 }, { "epoch": 2.25, "grad_norm": 1.0597162246704102, "learning_rate": 0.0009316831683168317, "loss": 2.8647, "step": 3090 }, { "epoch": 2.26, "grad_norm": 1.2362898588180542, "learning_rate": 0.0009306930693069308, "loss": 2.8565, "step": 3100 }, { "epoch": 2.27, "grad_norm": 1.588973879814148, "learning_rate": 0.0009297029702970298, "loss": 2.8443, "step": 3110 }, { "epoch": 2.27, "grad_norm": 1.7287636995315552, "learning_rate": 0.0009287128712871288, "loss": 2.8401, "step": 3120 }, { "epoch": 2.28, "grad_norm": 1.2376179695129395, "learning_rate": 0.0009277227722772278, "loss": 2.8338, "step": 3130 }, { "epoch": 2.29, "grad_norm": 1.1424429416656494, "learning_rate": 0.0009267326732673268, "loss": 2.8158, "step": 3140 }, { "epoch": 2.3, "grad_norm": 1.0561778545379639, "learning_rate": 0.0009257425742574258, "loss": 2.8086, "step": 3150 }, { "epoch": 2.3, "eval_accuracy": 0.4839770905517238, "eval_loss": 2.6107919216156006, "eval_runtime": 1086.2364, "eval_samples_per_second": 459.734, "eval_steps_per_second": 2.053, "step": 3150 }, { "epoch": 2.3, "grad_norm": 1.1775144338607788, "learning_rate": 0.0009247524752475249, "loss": 2.8002, "step": 3160 }, { "epoch": 2.31, "grad_norm": 1.1752933263778687, "learning_rate": 0.0009237623762376238, "loss": 2.7913, "step": 3170 }, { "epoch": 2.32, "grad_norm": 1.35330331325531, "learning_rate": 0.0009227722772277229, "loss": 2.7821, "step": 3180 }, { "epoch": 2.32, "grad_norm": 1.163878321647644, "learning_rate": 0.0009217821782178218, "loss": 2.7755, "step": 3190 }, { "epoch": 2.33, "grad_norm": 1.0859084129333496, "learning_rate": 0.0009207920792079209, "loss": 2.7651, "step": 3200 }, { "epoch": 2.34, "grad_norm": 1.1351704597473145, "learning_rate": 0.0009198019801980199, "loss": 2.7583, "step": 3210 }, { "epoch": 2.35, "grad_norm": 1.0613981485366821, "learning_rate": 0.0009188118811881188, "loss": 2.7482, "step": 3220 }, { "epoch": 2.35, "grad_norm": 1.1925811767578125, "learning_rate": 0.0009178217821782179, "loss": 2.7411, "step": 3230 }, { "epoch": 2.36, "grad_norm": 1.00603187084198, "learning_rate": 0.0009168316831683168, "loss": 2.7357, "step": 3240 }, { "epoch": 2.36, "eval_accuracy": 0.4945267646340819, "eval_loss": 2.5409770011901855, "eval_runtime": 1086.0357, "eval_samples_per_second": 459.819, "eval_steps_per_second": 2.053, "step": 3240 }, { "epoch": 2.37, "grad_norm": 1.0638866424560547, "learning_rate": 0.0009158415841584159, "loss": 2.7271, "step": 3250 }, { "epoch": 2.38, "grad_norm": 1.0579949617385864, "learning_rate": 0.000914851485148515, "loss": 2.7192, "step": 3260 }, { "epoch": 2.38, "grad_norm": 0.8697578310966492, "learning_rate": 0.0009138613861386139, "loss": 2.7114, "step": 3270 }, { "epoch": 2.39, "grad_norm": 1.1074854135513306, "learning_rate": 0.0009128712871287129, "loss": 2.7044, "step": 3280 }, { "epoch": 2.4, "grad_norm": 1.2240349054336548, "learning_rate": 0.0009118811881188119, "loss": 2.6981, "step": 3290 }, { "epoch": 2.4, "grad_norm": 1.0825715065002441, "learning_rate": 0.0009108910891089109, "loss": 2.688, "step": 3300 }, { "epoch": 2.41, "grad_norm": 0.9786016941070557, "learning_rate": 0.00090990099009901, "loss": 2.681, "step": 3310 }, { "epoch": 2.42, "grad_norm": 0.7355318665504456, "learning_rate": 0.0009089108910891089, "loss": 2.6771, "step": 3320 }, { "epoch": 2.43, "grad_norm": 1.0296987295150757, "learning_rate": 0.000907920792079208, "loss": 2.6714, "step": 3330 }, { "epoch": 2.43, "eval_accuracy": 0.5031676098849697, "eval_loss": 2.4873406887054443, "eval_runtime": 1086.6349, "eval_samples_per_second": 459.566, "eval_steps_per_second": 2.052, "step": 3330 }, { "epoch": 2.43, "grad_norm": 1.0019482374191284, "learning_rate": 0.0009069306930693069, "loss": 2.659, "step": 3340 }, { "epoch": 2.44, "grad_norm": 0.95697021484375, "learning_rate": 0.000905940594059406, "loss": 2.6534, "step": 3350 }, { "epoch": 2.45, "grad_norm": 0.9206619262695312, "learning_rate": 0.000904950495049505, "loss": 2.6499, "step": 3360 }, { "epoch": 2.46, "grad_norm": 0.9173060059547424, "learning_rate": 0.000903960396039604, "loss": 2.6436, "step": 3370 }, { "epoch": 2.46, "grad_norm": 0.946976900100708, "learning_rate": 0.000902970297029703, "loss": 2.6386, "step": 3380 }, { "epoch": 2.47, "grad_norm": 1.1060245037078857, "learning_rate": 0.000901980198019802, "loss": 2.6295, "step": 3390 }, { "epoch": 2.48, "grad_norm": 0.9128373861312866, "learning_rate": 0.000900990099009901, "loss": 2.6207, "step": 3400 }, { "epoch": 2.48, "grad_norm": 0.7775394916534424, "learning_rate": 0.0009000000000000001, "loss": 2.6152, "step": 3410 }, { "epoch": 2.49, "grad_norm": 1.0009465217590332, "learning_rate": 0.000899009900990099, "loss": 2.6114, "step": 3420 }, { "epoch": 2.49, "eval_accuracy": 0.5107387401188807, "eval_loss": 2.430750608444214, "eval_runtime": 1085.7185, "eval_samples_per_second": 459.953, "eval_steps_per_second": 2.054, "step": 3420 }, { "epoch": 2.5, "grad_norm": 0.9122986197471619, "learning_rate": 0.0008980198019801981, "loss": 2.6046, "step": 3430 }, { "epoch": 2.51, "grad_norm": 0.9263846278190613, "learning_rate": 0.000897029702970297, "loss": 2.6006, "step": 3440 }, { "epoch": 2.51, "grad_norm": 0.9440599083900452, "learning_rate": 0.0008960396039603961, "loss": 2.5954, "step": 3450 }, { "epoch": 2.52, "grad_norm": 1.0791646242141724, "learning_rate": 0.0008950495049504951, "loss": 2.5875, "step": 3460 }, { "epoch": 2.53, "grad_norm": 0.8801349401473999, "learning_rate": 0.0008940594059405941, "loss": 2.5805, "step": 3470 }, { "epoch": 2.54, "grad_norm": 0.8976075053215027, "learning_rate": 0.0008930693069306931, "loss": 2.5856, "step": 3480 }, { "epoch": 2.54, "grad_norm": 0.9874941110610962, "learning_rate": 0.0008920792079207921, "loss": 2.5741, "step": 3490 }, { "epoch": 2.55, "grad_norm": 0.8185487985610962, "learning_rate": 0.0008910891089108911, "loss": 2.5625, "step": 3500 }, { "epoch": 2.56, "grad_norm": 1.0372703075408936, "learning_rate": 0.0008900990099009902, "loss": 2.5622, "step": 3510 }, { "epoch": 2.56, "eval_accuracy": 0.5173882190435195, "eval_loss": 2.3903918266296387, "eval_runtime": 1085.7537, "eval_samples_per_second": 459.939, "eval_steps_per_second": 2.054, "step": 3510 }, { "epoch": 2.56, "grad_norm": 0.7737933397293091, "learning_rate": 0.0008891089108910891, "loss": 2.5551, "step": 3520 }, { "epoch": 2.57, "grad_norm": 0.8418464660644531, "learning_rate": 0.0008881188118811882, "loss": 2.5505, "step": 3530 }, { "epoch": 2.58, "grad_norm": 0.9638449549674988, "learning_rate": 0.0008871287128712871, "loss": 2.5506, "step": 3540 }, { "epoch": 2.59, "grad_norm": 0.8606787919998169, "learning_rate": 0.0008861386138613862, "loss": 2.5446, "step": 3550 }, { "epoch": 2.59, "grad_norm": 0.9567099213600159, "learning_rate": 0.0008851485148514852, "loss": 2.5316, "step": 3560 }, { "epoch": 2.6, "grad_norm": 0.9098414182662964, "learning_rate": 0.0008841584158415842, "loss": 2.5299, "step": 3570 }, { "epoch": 2.61, "grad_norm": 0.9305897951126099, "learning_rate": 0.0008831683168316832, "loss": 2.53, "step": 3580 }, { "epoch": 2.62, "grad_norm": 0.6542484164237976, "learning_rate": 0.0008821782178217822, "loss": 2.5182, "step": 3590 }, { "epoch": 2.62, "grad_norm": 0.8789640069007874, "learning_rate": 0.0008811881188118812, "loss": 2.5145, "step": 3600 }, { "epoch": 2.62, "eval_accuracy": 0.5233957340804408, "eval_loss": 2.3455302715301514, "eval_runtime": 1084.7897, "eval_samples_per_second": 460.347, "eval_steps_per_second": 2.056, "step": 3600 }, { "epoch": 2.63, "grad_norm": 0.7239986062049866, "learning_rate": 0.0008801980198019803, "loss": 2.5092, "step": 3610 }, { "epoch": 2.64, "grad_norm": 0.7931397557258606, "learning_rate": 0.0008792079207920792, "loss": 2.4992, "step": 3620 }, { "epoch": 2.65, "grad_norm": 0.9696986675262451, "learning_rate": 0.0008782178217821783, "loss": 2.5035, "step": 3630 }, { "epoch": 2.65, "grad_norm": 0.7151007056236267, "learning_rate": 0.0008772277227722772, "loss": 2.4927, "step": 3640 }, { "epoch": 2.66, "grad_norm": 1.054768443107605, "learning_rate": 0.0008762376237623763, "loss": 2.4831, "step": 3650 }, { "epoch": 2.67, "grad_norm": 0.7492119669914246, "learning_rate": 0.0008752475247524753, "loss": 2.4872, "step": 3660 }, { "epoch": 2.67, "grad_norm": 0.6677684187889099, "learning_rate": 0.0008742574257425743, "loss": 2.4797, "step": 3670 }, { "epoch": 2.68, "grad_norm": 0.8682121634483337, "learning_rate": 0.0008732673267326733, "loss": 2.4747, "step": 3680 }, { "epoch": 2.69, "grad_norm": 0.9361952543258667, "learning_rate": 0.0008722772277227722, "loss": 2.4741, "step": 3690 }, { "epoch": 2.69, "eval_accuracy": 0.5298733491746213, "eval_loss": 2.301394462585449, "eval_runtime": 1087.2464, "eval_samples_per_second": 459.307, "eval_steps_per_second": 2.051, "step": 3690 }, { "epoch": 2.7, "grad_norm": 0.7424592971801758, "learning_rate": 0.0008712871287128713, "loss": 2.4641, "step": 3700 }, { "epoch": 2.7, "grad_norm": 0.7794514894485474, "learning_rate": 0.0008702970297029704, "loss": 2.4558, "step": 3710 }, { "epoch": 2.71, "grad_norm": 0.8285810351371765, "learning_rate": 0.0008693069306930693, "loss": 2.4541, "step": 3720 }, { "epoch": 2.72, "grad_norm": 0.8555008769035339, "learning_rate": 0.0008683168316831684, "loss": 2.456, "step": 3730 }, { "epoch": 2.73, "grad_norm": 0.7897722721099854, "learning_rate": 0.0008673267326732673, "loss": 2.4447, "step": 3740 }, { "epoch": 2.73, "grad_norm": 0.7043498754501343, "learning_rate": 0.0008663366336633663, "loss": 2.44, "step": 3750 }, { "epoch": 2.74, "grad_norm": 0.7770318984985352, "learning_rate": 0.0008653465346534654, "loss": 2.439, "step": 3760 }, { "epoch": 2.75, "grad_norm": 0.8721603155136108, "learning_rate": 0.0008643564356435643, "loss": 2.4363, "step": 3770 }, { "epoch": 2.75, "grad_norm": 0.653965413570404, "learning_rate": 0.0008633663366336634, "loss": 2.4298, "step": 3780 }, { "epoch": 2.75, "eval_accuracy": 0.5352807916398568, "eval_loss": 2.2673096656799316, "eval_runtime": 1086.4494, "eval_samples_per_second": 459.644, "eval_steps_per_second": 2.053, "step": 3780 }, { "epoch": 2.76, "grad_norm": 0.7826604247093201, "learning_rate": 0.0008623762376237623, "loss": 2.4204, "step": 3790 }, { "epoch": 2.77, "grad_norm": 0.6523510813713074, "learning_rate": 0.0008613861386138614, "loss": 2.4194, "step": 3800 }, { "epoch": 2.78, "grad_norm": 0.9408191442489624, "learning_rate": 0.0008603960396039604, "loss": 2.4135, "step": 3810 }, { "epoch": 2.78, "grad_norm": 0.8393464684486389, "learning_rate": 0.0008594059405940594, "loss": 2.4179, "step": 3820 }, { "epoch": 2.79, "grad_norm": 0.7277712821960449, "learning_rate": 0.0008584158415841584, "loss": 2.4087, "step": 3830 }, { "epoch": 2.8, "grad_norm": 0.6864319443702698, "learning_rate": 0.0008574257425742574, "loss": 2.405, "step": 3840 }, { "epoch": 2.81, "grad_norm": 0.6246985197067261, "learning_rate": 0.0008564356435643564, "loss": 2.3962, "step": 3850 }, { "epoch": 2.81, "grad_norm": 0.7276294231414795, "learning_rate": 0.0008554455445544555, "loss": 2.4043, "step": 3860 }, { "epoch": 2.82, "grad_norm": 0.7767272591590881, "learning_rate": 0.0008544554455445544, "loss": 2.3947, "step": 3870 }, { "epoch": 2.82, "eval_accuracy": 0.5401726205914658, "eval_loss": 2.2340025901794434, "eval_runtime": 1089.0379, "eval_samples_per_second": 458.552, "eval_steps_per_second": 2.048, "step": 3870 }, { "epoch": 2.83, "grad_norm": 0.717089831829071, "learning_rate": 0.0008534653465346535, "loss": 2.3934, "step": 3880 }, { "epoch": 2.83, "grad_norm": 0.7381496429443359, "learning_rate": 0.0008524752475247524, "loss": 2.384, "step": 3890 }, { "epoch": 2.84, "grad_norm": 0.8456007838249207, "learning_rate": 0.0008514851485148515, "loss": 2.3842, "step": 3900 }, { "epoch": 2.85, "grad_norm": 0.6353156566619873, "learning_rate": 0.0008504950495049505, "loss": 2.3774, "step": 3910 }, { "epoch": 2.86, "grad_norm": 0.6743925213813782, "learning_rate": 0.0008495049504950495, "loss": 2.3775, "step": 3920 }, { "epoch": 2.86, "grad_norm": 0.6839917898178101, "learning_rate": 0.0008485148514851485, "loss": 2.3737, "step": 3930 }, { "epoch": 2.87, "grad_norm": 0.6635532379150391, "learning_rate": 0.0008475247524752475, "loss": 2.3689, "step": 3940 }, { "epoch": 2.88, "grad_norm": 0.7584016919136047, "learning_rate": 0.0008465346534653465, "loss": 2.3623, "step": 3950 }, { "epoch": 2.89, "grad_norm": 0.9728506803512573, "learning_rate": 0.0008455445544554456, "loss": 2.3653, "step": 3960 }, { "epoch": 2.89, "eval_accuracy": 0.5444187372461048, "eval_loss": 2.210555076599121, "eval_runtime": 1087.1075, "eval_samples_per_second": 459.366, "eval_steps_per_second": 2.051, "step": 3960 }, { "epoch": 2.89, "grad_norm": 0.7243532538414001, "learning_rate": 0.0008445544554455445, "loss": 2.3605, "step": 3970 }, { "epoch": 2.9, "grad_norm": 0.5774228572845459, "learning_rate": 0.0008435643564356436, "loss": 2.3547, "step": 3980 }, { "epoch": 2.91, "grad_norm": 0.9098168611526489, "learning_rate": 0.0008425742574257425, "loss": 2.3507, "step": 3990 }, { "epoch": 2.91, "grad_norm": 0.8770572543144226, "learning_rate": 0.0008415841584158416, "loss": 2.3535, "step": 4000 }, { "epoch": 2.92, "grad_norm": 0.5555605292320251, "learning_rate": 0.0008405940594059406, "loss": 2.3463, "step": 4010 }, { "epoch": 2.93, "grad_norm": 0.8333105444908142, "learning_rate": 0.0008396039603960396, "loss": 2.3417, "step": 4020 }, { "epoch": 2.94, "grad_norm": 0.724617600440979, "learning_rate": 0.0008386138613861386, "loss": 2.3382, "step": 4030 }, { "epoch": 2.94, "grad_norm": 0.6441348791122437, "learning_rate": 0.0008376237623762376, "loss": 2.3322, "step": 4040 }, { "epoch": 2.95, "grad_norm": 0.7889347672462463, "learning_rate": 0.0008366336633663366, "loss": 2.332, "step": 4050 }, { "epoch": 2.95, "eval_accuracy": 0.5490578933317315, "eval_loss": 2.1787993907928467, "eval_runtime": 1089.6908, "eval_samples_per_second": 458.277, "eval_steps_per_second": 2.046, "step": 4050 }, { "epoch": 2.96, "grad_norm": 0.7708555459976196, "learning_rate": 0.0008356435643564357, "loss": 2.3326, "step": 4060 }, { "epoch": 2.97, "grad_norm": 0.5825323462486267, "learning_rate": 0.0008346534653465346, "loss": 2.3269, "step": 4070 }, { "epoch": 2.97, "grad_norm": 0.6252484321594238, "learning_rate": 0.0008336633663366337, "loss": 2.3226, "step": 4080 }, { "epoch": 2.98, "grad_norm": 0.8860800266265869, "learning_rate": 0.0008326732673267326, "loss": 2.3213, "step": 4090 }, { "epoch": 2.99, "grad_norm": 0.6472296714782715, "learning_rate": 0.0008316831683168317, "loss": 2.3159, "step": 4100 }, { "epoch": 2.99, "grad_norm": 0.5913267731666565, "learning_rate": 0.0008306930693069307, "loss": 2.31, "step": 4110 }, { "epoch": 3.0, "grad_norm": 0.9591660499572754, "learning_rate": 0.0008297029702970297, "loss": 2.3105, "step": 4120 }, { "epoch": 3.01, "grad_norm": 0.6770280599594116, "learning_rate": 0.0008287128712871287, "loss": 2.3043, "step": 4130 }, { "epoch": 3.02, "grad_norm": 0.6088559031486511, "learning_rate": 0.0008277227722772277, "loss": 2.3006, "step": 4140 }, { "epoch": 3.02, "eval_accuracy": 0.5543035745580942, "eval_loss": 2.146070718765259, "eval_runtime": 1088.9664, "eval_samples_per_second": 458.582, "eval_steps_per_second": 2.048, "step": 4140 }, { "epoch": 3.02, "grad_norm": 0.8546580076217651, "learning_rate": 0.0008267326732673267, "loss": 2.3008, "step": 4150 }, { "epoch": 3.03, "grad_norm": 0.5891646146774292, "learning_rate": 0.0008257425742574258, "loss": 2.2996, "step": 4160 }, { "epoch": 3.04, "grad_norm": 0.7681787610054016, "learning_rate": 0.0008247524752475247, "loss": 2.291, "step": 4170 }, { "epoch": 3.05, "grad_norm": 0.6504939198493958, "learning_rate": 0.0008237623762376238, "loss": 2.2908, "step": 4180 }, { "epoch": 3.05, "grad_norm": 0.7509806752204895, "learning_rate": 0.0008227722772277227, "loss": 2.2894, "step": 4190 }, { "epoch": 3.06, "grad_norm": 0.9019284844398499, "learning_rate": 0.0008217821782178218, "loss": 2.2851, "step": 4200 }, { "epoch": 3.07, "grad_norm": 0.5588305592536926, "learning_rate": 0.0008207920792079208, "loss": 2.284, "step": 4210 }, { "epoch": 3.07, "grad_norm": 0.659537672996521, "learning_rate": 0.0008198019801980197, "loss": 2.2758, "step": 4220 }, { "epoch": 3.08, "grad_norm": 0.6673635840415955, "learning_rate": 0.0008188118811881188, "loss": 2.2731, "step": 4230 }, { "epoch": 3.08, "eval_accuracy": 0.5577760802425583, "eval_loss": 2.1262881755828857, "eval_runtime": 1086.5663, "eval_samples_per_second": 459.595, "eval_steps_per_second": 2.052, "step": 4230 }, { "epoch": 3.09, "grad_norm": 0.7020514607429504, "learning_rate": 0.0008178217821782177, "loss": 2.2744, "step": 4240 }, { "epoch": 3.1, "grad_norm": 0.6836090087890625, "learning_rate": 0.0008168316831683168, "loss": 2.2726, "step": 4250 }, { "epoch": 3.1, "grad_norm": 0.8297485709190369, "learning_rate": 0.0008158415841584159, "loss": 2.2696, "step": 4260 }, { "epoch": 3.11, "grad_norm": 0.6747464537620544, "learning_rate": 0.0008148514851485148, "loss": 2.2667, "step": 4270 }, { "epoch": 3.12, "grad_norm": 0.6610013246536255, "learning_rate": 0.0008138613861386138, "loss": 2.261, "step": 4280 }, { "epoch": 3.13, "grad_norm": 0.9729331731796265, "learning_rate": 0.0008128712871287128, "loss": 2.2634, "step": 4290 }, { "epoch": 3.13, "grad_norm": 0.580893874168396, "learning_rate": 0.000811881188118812, "loss": 2.2643, "step": 4300 }, { "epoch": 3.14, "grad_norm": 0.5239897966384888, "learning_rate": 0.000810891089108911, "loss": 2.2533, "step": 4310 }, { "epoch": 3.15, "grad_norm": 0.9247382283210754, "learning_rate": 0.00080990099009901, "loss": 2.2544, "step": 4320 }, { "epoch": 3.15, "eval_accuracy": 0.5607038793304535, "eval_loss": 2.1080663204193115, "eval_runtime": 1087.2566, "eval_samples_per_second": 459.303, "eval_steps_per_second": 2.051, "step": 4320 }, { "epoch": 3.16, "grad_norm": 0.7254253029823303, "learning_rate": 0.000808910891089109, "loss": 2.2521, "step": 4330 }, { "epoch": 3.16, "grad_norm": 0.47916772961616516, "learning_rate": 0.0008079207920792079, "loss": 2.2471, "step": 4340 }, { "epoch": 3.17, "grad_norm": 0.8148419260978699, "learning_rate": 0.000806930693069307, "loss": 2.2479, "step": 4350 }, { "epoch": 3.18, "grad_norm": 0.6527644991874695, "learning_rate": 0.000805940594059406, "loss": 2.2446, "step": 4360 }, { "epoch": 3.18, "grad_norm": 0.7129182815551758, "learning_rate": 0.000804950495049505, "loss": 2.2382, "step": 4370 }, { "epoch": 3.19, "grad_norm": 0.6954285502433777, "learning_rate": 0.000803960396039604, "loss": 2.2399, "step": 4380 }, { "epoch": 3.2, "grad_norm": 0.6172522902488708, "learning_rate": 0.000802970297029703, "loss": 2.2395, "step": 4390 }, { "epoch": 3.21, "grad_norm": 0.8309088349342346, "learning_rate": 0.000801980198019802, "loss": 2.2379, "step": 4400 }, { "epoch": 3.21, "grad_norm": 0.6792633533477783, "learning_rate": 0.0008009900990099011, "loss": 2.2364, "step": 4410 }, { "epoch": 3.21, "eval_accuracy": 0.5646760378560493, "eval_loss": 2.0840134620666504, "eval_runtime": 1086.4023, "eval_samples_per_second": 459.664, "eval_steps_per_second": 2.053, "step": 4410 }, { "epoch": 3.22, "grad_norm": 0.6008450388908386, "learning_rate": 0.0008, "loss": 2.2289, "step": 4420 }, { "epoch": 3.23, "grad_norm": 0.5826246738433838, "learning_rate": 0.0007990099009900991, "loss": 2.2259, "step": 4430 }, { "epoch": 3.24, "grad_norm": 0.6360362768173218, "learning_rate": 0.000798019801980198, "loss": 2.2262, "step": 4440 }, { "epoch": 3.24, "grad_norm": 0.7450495362281799, "learning_rate": 0.0007970297029702971, "loss": 2.2241, "step": 4450 }, { "epoch": 3.25, "grad_norm": 0.5571395754814148, "learning_rate": 0.0007960396039603961, "loss": 2.2245, "step": 4460 }, { "epoch": 3.26, "grad_norm": 0.622724175453186, "learning_rate": 0.0007950495049504951, "loss": 2.2183, "step": 4470 }, { "epoch": 3.26, "grad_norm": 0.621284544467926, "learning_rate": 0.0007940594059405941, "loss": 2.2153, "step": 4480 }, { "epoch": 3.27, "grad_norm": 0.9469096660614014, "learning_rate": 0.0007930693069306931, "loss": 2.2156, "step": 4490 }, { "epoch": 3.28, "grad_norm": 0.6318257451057434, "learning_rate": 0.0007920792079207921, "loss": 2.2143, "step": 4500 }, { "epoch": 3.28, "eval_accuracy": 0.5673093799211185, "eval_loss": 2.0671584606170654, "eval_runtime": 1088.5283, "eval_samples_per_second": 458.766, "eval_steps_per_second": 2.049, "step": 4500 }, { "epoch": 3.29, "grad_norm": 0.7229343056678772, "learning_rate": 0.0007910891089108912, "loss": 2.2111, "step": 4510 }, { "epoch": 3.29, "grad_norm": 0.8711042404174805, "learning_rate": 0.0007900990099009901, "loss": 2.2144, "step": 4520 }, { "epoch": 3.3, "grad_norm": 0.5540309548377991, "learning_rate": 0.0007891089108910892, "loss": 2.2063, "step": 4530 }, { "epoch": 3.31, "grad_norm": 0.6157627105712891, "learning_rate": 0.0007881188118811881, "loss": 2.2026, "step": 4540 }, { "epoch": 3.32, "grad_norm": 1.0260213613510132, "learning_rate": 0.0007871287128712872, "loss": 2.2107, "step": 4550 }, { "epoch": 3.32, "grad_norm": 0.5229135155677795, "learning_rate": 0.0007861386138613862, "loss": 2.2031, "step": 4560 }, { "epoch": 3.33, "grad_norm": 0.460483580827713, "learning_rate": 0.0007851485148514852, "loss": 2.1956, "step": 4570 }, { "epoch": 3.34, "grad_norm": 0.9283266067504883, "learning_rate": 0.0007841584158415842, "loss": 2.1976, "step": 4580 }, { "epoch": 3.34, "grad_norm": 0.5621626973152161, "learning_rate": 0.0007831683168316832, "loss": 2.1972, "step": 4590 }, { "epoch": 3.34, "eval_accuracy": 0.5702843964025183, "eval_loss": 2.049508810043335, "eval_runtime": 1086.5875, "eval_samples_per_second": 459.586, "eval_steps_per_second": 2.052, "step": 4590 }, { "epoch": 3.35, "grad_norm": 0.6998817324638367, "learning_rate": 0.0007821782178217822, "loss": 2.1905, "step": 4600 }, { "epoch": 3.36, "grad_norm": 0.6161481738090515, "learning_rate": 0.0007811881188118813, "loss": 2.1916, "step": 4610 }, { "epoch": 3.37, "grad_norm": 0.7690967321395874, "learning_rate": 0.0007801980198019802, "loss": 2.1904, "step": 4620 }, { "epoch": 3.37, "grad_norm": 0.6058160066604614, "learning_rate": 0.0007792079207920793, "loss": 2.1894, "step": 4630 }, { "epoch": 3.38, "grad_norm": 0.5859819054603577, "learning_rate": 0.0007782178217821782, "loss": 2.1823, "step": 4640 }, { "epoch": 3.39, "grad_norm": 0.7922290563583374, "learning_rate": 0.0007772277227722773, "loss": 2.1816, "step": 4650 }, { "epoch": 3.4, "grad_norm": 0.617785632610321, "learning_rate": 0.0007762376237623763, "loss": 2.1855, "step": 4660 }, { "epoch": 3.4, "grad_norm": 0.6082860827445984, "learning_rate": 0.0007752475247524753, "loss": 2.1811, "step": 4670 }, { "epoch": 3.41, "grad_norm": 0.44060420989990234, "learning_rate": 0.0007742574257425743, "loss": 2.1738, "step": 4680 }, { "epoch": 3.41, "eval_accuracy": 0.5732795510180845, "eval_loss": 2.031883955001831, "eval_runtime": 1085.1554, "eval_samples_per_second": 460.192, "eval_steps_per_second": 2.055, "step": 4680 }, { "epoch": 3.42, "grad_norm": 0.6842211484909058, "learning_rate": 0.0007732673267326733, "loss": 2.1746, "step": 4690 }, { "epoch": 3.42, "grad_norm": 0.7363536357879639, "learning_rate": 0.0007722772277227723, "loss": 2.1756, "step": 4700 }, { "epoch": 3.43, "grad_norm": 0.657122015953064, "learning_rate": 0.0007712871287128714, "loss": 2.1767, "step": 4710 }, { "epoch": 3.44, "grad_norm": 0.525112509727478, "learning_rate": 0.0007702970297029703, "loss": 2.1705, "step": 4720 }, { "epoch": 3.45, "grad_norm": 0.594642162322998, "learning_rate": 0.0007693069306930694, "loss": 2.167, "step": 4730 }, { "epoch": 3.45, "grad_norm": 0.7353718280792236, "learning_rate": 0.0007683168316831683, "loss": 2.1678, "step": 4740 }, { "epoch": 3.46, "grad_norm": 0.7462971806526184, "learning_rate": 0.0007673267326732674, "loss": 2.1665, "step": 4750 }, { "epoch": 3.47, "grad_norm": 0.543685257434845, "learning_rate": 0.0007663366336633664, "loss": 2.1624, "step": 4760 }, { "epoch": 3.48, "grad_norm": 0.53340744972229, "learning_rate": 0.0007653465346534654, "loss": 2.1587, "step": 4770 }, { "epoch": 3.48, "eval_accuracy": 0.5758243970327254, "eval_loss": 2.01580810546875, "eval_runtime": 1087.4822, "eval_samples_per_second": 459.208, "eval_steps_per_second": 2.051, "step": 4770 }, { "epoch": 3.48, "grad_norm": 0.819010853767395, "learning_rate": 0.0007643564356435644, "loss": 2.1623, "step": 4780 }, { "epoch": 3.49, "grad_norm": 0.6191548109054565, "learning_rate": 0.0007633663366336634, "loss": 2.1611, "step": 4790 }, { "epoch": 3.5, "grad_norm": 0.5724292993545532, "learning_rate": 0.0007623762376237624, "loss": 2.1583, "step": 4800 }, { "epoch": 3.5, "grad_norm": 0.6278745532035828, "learning_rate": 0.0007613861386138615, "loss": 2.1573, "step": 4810 }, { "epoch": 3.51, "grad_norm": 0.6978874802589417, "learning_rate": 0.0007603960396039604, "loss": 2.1523, "step": 4820 }, { "epoch": 3.52, "grad_norm": 0.9318163990974426, "learning_rate": 0.0007594059405940595, "loss": 2.1524, "step": 4830 }, { "epoch": 3.53, "grad_norm": 0.5397381782531738, "learning_rate": 0.0007584158415841584, "loss": 2.1505, "step": 4840 }, { "epoch": 3.53, "grad_norm": 0.6277997493743896, "learning_rate": 0.0007574257425742574, "loss": 2.15, "step": 4850 }, { "epoch": 3.54, "grad_norm": 0.6128600239753723, "learning_rate": 0.0007564356435643565, "loss": 2.1466, "step": 4860 }, { "epoch": 3.54, "eval_accuracy": 0.5778634145294884, "eval_loss": 2.0040018558502197, "eval_runtime": 1086.3606, "eval_samples_per_second": 459.682, "eval_steps_per_second": 2.053, "step": 4860 }, { "epoch": 3.55, "grad_norm": 0.602790892124176, "learning_rate": 0.0007554455445544554, "loss": 2.1483, "step": 4870 }, { "epoch": 3.56, "grad_norm": 0.5856905579566956, "learning_rate": 0.0007544554455445545, "loss": 2.1471, "step": 4880 }, { "epoch": 3.56, "grad_norm": 0.6018987894058228, "learning_rate": 0.0007534653465346534, "loss": 2.141, "step": 4890 }, { "epoch": 3.57, "grad_norm": 0.6041855216026306, "learning_rate": 0.0007524752475247525, "loss": 2.1385, "step": 4900 }, { "epoch": 3.58, "grad_norm": 0.580766499042511, "learning_rate": 0.0007514851485148515, "loss": 2.1375, "step": 4910 }, { "epoch": 3.58, "grad_norm": 0.5637401342391968, "learning_rate": 0.0007504950495049505, "loss": 2.1374, "step": 4920 }, { "epoch": 3.59, "grad_norm": 0.6017095446586609, "learning_rate": 0.0007495049504950495, "loss": 2.1372, "step": 4930 }, { "epoch": 3.6, "grad_norm": 0.766730010509491, "learning_rate": 0.0007485148514851485, "loss": 2.1336, "step": 4940 }, { "epoch": 3.61, "grad_norm": 0.5679196119308472, "learning_rate": 0.0007475247524752475, "loss": 2.1339, "step": 4950 }, { "epoch": 3.61, "eval_accuracy": 0.5801526152356328, "eval_loss": 1.9901340007781982, "eval_runtime": 1085.9898, "eval_samples_per_second": 459.839, "eval_steps_per_second": 2.053, "step": 4950 }, { "epoch": 3.61, "grad_norm": 0.5875471234321594, "learning_rate": 0.0007465346534653466, "loss": 2.1327, "step": 4960 }, { "epoch": 3.62, "grad_norm": 0.885311484336853, "learning_rate": 0.0007455445544554455, "loss": 2.1319, "step": 4970 }, { "epoch": 3.63, "grad_norm": 0.5480872392654419, "learning_rate": 0.0007445544554455446, "loss": 2.1283, "step": 4980 }, { "epoch": 3.64, "grad_norm": 0.6052006483078003, "learning_rate": 0.0007435643564356435, "loss": 2.1258, "step": 4990 }, { "epoch": 3.64, "grad_norm": 0.4672467112541199, "learning_rate": 0.0007425742574257426, "loss": 2.1246, "step": 5000 }, { "epoch": 3.65, "grad_norm": 0.7137532234191895, "learning_rate": 0.0007415841584158416, "loss": 2.1225, "step": 5010 }, { "epoch": 3.66, "grad_norm": 0.726308286190033, "learning_rate": 0.0007405940594059406, "loss": 2.1247, "step": 5020 }, { "epoch": 3.67, "grad_norm": 0.4779931902885437, "learning_rate": 0.0007396039603960396, "loss": 2.1215, "step": 5030 }, { "epoch": 3.67, "grad_norm": 0.5192296504974365, "learning_rate": 0.0007386138613861386, "loss": 2.1151, "step": 5040 }, { "epoch": 3.67, "eval_accuracy": 0.5818349975818327, "eval_loss": 1.981979489326477, "eval_runtime": 1085.0979, "eval_samples_per_second": 460.217, "eval_steps_per_second": 2.055, "step": 5040 }, { "epoch": 3.68, "grad_norm": 0.824546754360199, "learning_rate": 0.0007376237623762376, "loss": 2.1176, "step": 5050 }, { "epoch": 3.69, "grad_norm": 0.5459938049316406, "learning_rate": 0.0007366336633663367, "loss": 2.1209, "step": 5060 }, { "epoch": 3.69, "grad_norm": 0.657993733882904, "learning_rate": 0.0007356435643564356, "loss": 2.1174, "step": 5070 }, { "epoch": 3.7, "grad_norm": 0.6625123023986816, "learning_rate": 0.0007346534653465347, "loss": 2.114, "step": 5080 }, { "epoch": 3.71, "grad_norm": 0.6521473526954651, "learning_rate": 0.0007336633663366336, "loss": 2.1126, "step": 5090 }, { "epoch": 3.72, "grad_norm": 0.540843665599823, "learning_rate": 0.0007326732673267327, "loss": 2.1139, "step": 5100 }, { "epoch": 3.72, "grad_norm": 0.5456762313842773, "learning_rate": 0.0007316831683168317, "loss": 2.1096, "step": 5110 }, { "epoch": 3.73, "grad_norm": 0.6277236938476562, "learning_rate": 0.0007306930693069307, "loss": 2.1085, "step": 5120 }, { "epoch": 3.74, "grad_norm": 0.5047609210014343, "learning_rate": 0.0007297029702970297, "loss": 2.1048, "step": 5130 }, { "epoch": 3.74, "eval_accuracy": 0.5844656610858342, "eval_loss": 1.9673104286193848, "eval_runtime": 1086.0993, "eval_samples_per_second": 459.792, "eval_steps_per_second": 2.053, "step": 5130 }, { "epoch": 3.75, "grad_norm": 0.5852828025817871, "learning_rate": 0.0007287128712871287, "loss": 2.1091, "step": 5140 }, { "epoch": 3.75, "grad_norm": 0.49883949756622314, "learning_rate": 0.0007277227722772277, "loss": 2.1059, "step": 5150 }, { "epoch": 3.76, "grad_norm": 0.4874211251735687, "learning_rate": 0.0007267326732673268, "loss": 2.101, "step": 5160 }, { "epoch": 3.77, "grad_norm": 0.807388961315155, "learning_rate": 0.0007257425742574257, "loss": 2.1019, "step": 5170 }, { "epoch": 3.77, "grad_norm": 0.4876428246498108, "learning_rate": 0.0007247524752475248, "loss": 2.101, "step": 5180 }, { "epoch": 3.78, "grad_norm": 0.5534060597419739, "learning_rate": 0.0007237623762376237, "loss": 2.096, "step": 5190 }, { "epoch": 3.79, "grad_norm": 0.5464605093002319, "learning_rate": 0.0007227722772277228, "loss": 2.0999, "step": 5200 }, { "epoch": 3.8, "grad_norm": 0.6738607883453369, "learning_rate": 0.0007217821782178218, "loss": 2.0973, "step": 5210 }, { "epoch": 3.8, "grad_norm": 0.5829378366470337, "learning_rate": 0.0007207920792079208, "loss": 2.0943, "step": 5220 }, { "epoch": 3.8, "eval_accuracy": 0.5862642603086486, "eval_loss": 1.9560039043426514, "eval_runtime": 1085.7866, "eval_samples_per_second": 459.925, "eval_steps_per_second": 2.054, "step": 5220 }, { "epoch": 3.81, "grad_norm": 0.6175411939620972, "learning_rate": 0.0007198019801980198, "loss": 2.097, "step": 5230 }, { "epoch": 3.82, "grad_norm": 0.4551532566547394, "learning_rate": 0.0007188118811881188, "loss": 2.0909, "step": 5240 }, { "epoch": 3.83, "grad_norm": 0.6612002849578857, "learning_rate": 0.0007178217821782178, "loss": 2.0885, "step": 5250 }, { "epoch": 3.83, "grad_norm": 0.5608059763908386, "learning_rate": 0.0007168316831683169, "loss": 2.0893, "step": 5260 }, { "epoch": 3.84, "grad_norm": 0.6450534462928772, "learning_rate": 0.0007158415841584158, "loss": 2.0881, "step": 5270 }, { "epoch": 3.85, "grad_norm": 0.5012123584747314, "learning_rate": 0.0007148514851485149, "loss": 2.0843, "step": 5280 }, { "epoch": 3.85, "grad_norm": 0.5907981395721436, "learning_rate": 0.0007138613861386138, "loss": 2.0848, "step": 5290 }, { "epoch": 3.86, "grad_norm": 0.6378484964370728, "learning_rate": 0.0007128712871287129, "loss": 2.0891, "step": 5300 }, { "epoch": 3.87, "grad_norm": 0.5008774399757385, "learning_rate": 0.0007118811881188119, "loss": 2.0839, "step": 5310 }, { "epoch": 3.87, "eval_accuracy": 0.5886411137424359, "eval_loss": 1.9411782026290894, "eval_runtime": 1086.0682, "eval_samples_per_second": 459.805, "eval_steps_per_second": 2.053, "step": 5310 }, { "epoch": 3.88, "grad_norm": 0.5727280378341675, "learning_rate": 0.0007108910891089109, "loss": 2.0834, "step": 5320 }, { "epoch": 3.88, "grad_norm": 0.5417036414146423, "learning_rate": 0.0007099009900990099, "loss": 2.0806, "step": 5330 }, { "epoch": 3.89, "grad_norm": 0.5739684104919434, "learning_rate": 0.0007089108910891088, "loss": 2.0809, "step": 5340 }, { "epoch": 3.9, "grad_norm": 0.4842034876346588, "learning_rate": 0.0007079207920792079, "loss": 2.0787, "step": 5350 }, { "epoch": 3.91, "grad_norm": 0.7235381603240967, "learning_rate": 0.000706930693069307, "loss": 2.0761, "step": 5360 }, { "epoch": 3.91, "grad_norm": 0.5333278775215149, "learning_rate": 0.0007059405940594059, "loss": 2.082, "step": 5370 }, { "epoch": 3.92, "grad_norm": 0.5933953523635864, "learning_rate": 0.000704950495049505, "loss": 2.0711, "step": 5380 }, { "epoch": 3.93, "grad_norm": 0.3995033800601959, "learning_rate": 0.0007039603960396039, "loss": 2.072, "step": 5390 }, { "epoch": 3.93, "grad_norm": 0.5461521744728088, "learning_rate": 0.0007029702970297029, "loss": 2.0748, "step": 5400 }, { "epoch": 3.93, "eval_accuracy": 0.5892188513716441, "eval_loss": 1.9380106925964355, "eval_runtime": 1086.2866, "eval_samples_per_second": 459.713, "eval_steps_per_second": 2.053, "step": 5400 }, { "epoch": 3.94, "grad_norm": 0.6759417057037354, "learning_rate": 0.000701980198019802, "loss": 2.0743, "step": 5410 }, { "epoch": 3.95, "grad_norm": 0.4820743203163147, "learning_rate": 0.0007009900990099009, "loss": 2.0708, "step": 5420 }, { "epoch": 3.96, "grad_norm": 0.5824475884437561, "learning_rate": 0.0007, "loss": 2.0676, "step": 5430 }, { "epoch": 3.96, "grad_norm": 0.5133311748504639, "learning_rate": 0.0006990099009900989, "loss": 2.0688, "step": 5440 }, { "epoch": 3.97, "grad_norm": 0.5744913220405579, "learning_rate": 0.000698019801980198, "loss": 2.0657, "step": 5450 }, { "epoch": 3.98, "grad_norm": 0.5103346705436707, "learning_rate": 0.000697029702970297, "loss": 2.0674, "step": 5460 }, { "epoch": 3.99, "grad_norm": 0.48300009965896606, "learning_rate": 0.000696039603960396, "loss": 2.0649, "step": 5470 }, { "epoch": 3.99, "grad_norm": 0.64620441198349, "learning_rate": 0.000695049504950495, "loss": 2.0658, "step": 5480 }, { "epoch": 4.0, "grad_norm": 0.5096336603164673, "learning_rate": 0.000694059405940594, "loss": 2.0671, "step": 5490 }, { "epoch": 4.0, "eval_accuracy": 0.5915717983649539, "eval_loss": 1.9237810373306274, "eval_runtime": 1088.2291, "eval_samples_per_second": 458.892, "eval_steps_per_second": 2.049, "step": 5490 }, { "epoch": 4.01, "grad_norm": 0.45002949237823486, "learning_rate": 0.000693069306930693, "loss": 2.0612, "step": 5500 }, { "epoch": 4.01, "grad_norm": 0.48484668135643005, "learning_rate": 0.0006920792079207921, "loss": 2.0598, "step": 5510 }, { "epoch": 4.02, "grad_norm": 0.5888765454292297, "learning_rate": 0.000691089108910891, "loss": 2.0665, "step": 5520 }, { "epoch": 4.03, "grad_norm": 0.6311919689178467, "learning_rate": 0.0006900990099009901, "loss": 2.0604, "step": 5530 }, { "epoch": 4.04, "grad_norm": 0.4337356686592102, "learning_rate": 0.000689108910891089, "loss": 2.0562, "step": 5540 }, { "epoch": 4.04, "grad_norm": 0.5855375528335571, "learning_rate": 0.0006881188118811881, "loss": 2.0535, "step": 5550 }, { "epoch": 4.05, "grad_norm": 0.6715276837348938, "learning_rate": 0.0006871287128712872, "loss": 2.0569, "step": 5560 }, { "epoch": 4.06, "grad_norm": 0.5453487634658813, "learning_rate": 0.0006861386138613862, "loss": 2.0539, "step": 5570 }, { "epoch": 4.07, "grad_norm": 0.44611501693725586, "learning_rate": 0.0006851485148514852, "loss": 2.0557, "step": 5580 }, { "epoch": 4.07, "eval_accuracy": 0.593527753135558, "eval_loss": 1.9137904644012451, "eval_runtime": 1092.9474, "eval_samples_per_second": 456.911, "eval_steps_per_second": 2.04, "step": 5580 }, { "epoch": 4.07, "grad_norm": 0.46945101022720337, "learning_rate": 0.0006841584158415842, "loss": 2.0494, "step": 5590 }, { "epoch": 4.08, "grad_norm": 0.6355441808700562, "learning_rate": 0.0006831683168316832, "loss": 2.0535, "step": 5600 }, { "epoch": 4.09, "grad_norm": 0.5079066157341003, "learning_rate": 0.0006821782178217823, "loss": 2.0535, "step": 5610 }, { "epoch": 4.09, "grad_norm": 0.48156508803367615, "learning_rate": 0.0006811881188118812, "loss": 2.0479, "step": 5620 }, { "epoch": 4.1, "grad_norm": 0.40759479999542236, "learning_rate": 0.0006801980198019803, "loss": 2.0465, "step": 5630 }, { "epoch": 4.11, "grad_norm": 0.6267878413200378, "learning_rate": 0.0006792079207920792, "loss": 2.047, "step": 5640 }, { "epoch": 4.12, "grad_norm": 0.4935464560985565, "learning_rate": 0.0006782178217821783, "loss": 2.0457, "step": 5650 }, { "epoch": 4.12, "grad_norm": 0.8215575814247131, "learning_rate": 0.0006772277227722773, "loss": 2.0459, "step": 5660 }, { "epoch": 4.13, "grad_norm": 0.5285871624946594, "learning_rate": 0.0006762376237623763, "loss": 2.046, "step": 5670 }, { "epoch": 4.13, "eval_accuracy": 0.5947354040081998, "eval_loss": 1.9072514772415161, "eval_runtime": 1086.9971, "eval_samples_per_second": 459.412, "eval_steps_per_second": 2.052, "step": 5670 }, { "epoch": 4.14, "grad_norm": 0.4480656087398529, "learning_rate": 0.0006752475247524753, "loss": 2.0419, "step": 5680 }, { "epoch": 4.15, "grad_norm": 0.5856757760047913, "learning_rate": 0.0006742574257425743, "loss": 2.043, "step": 5690 }, { "epoch": 4.15, "grad_norm": 0.5994493365287781, "learning_rate": 0.0006732673267326733, "loss": 2.0423, "step": 5700 }, { "epoch": 4.16, "grad_norm": 0.5151802897453308, "learning_rate": 0.0006722772277227724, "loss": 2.0367, "step": 5710 }, { "epoch": 4.17, "grad_norm": 0.5299440622329712, "learning_rate": 0.0006712871287128713, "loss": 2.0404, "step": 5720 }, { "epoch": 4.18, "grad_norm": 0.5033411979675293, "learning_rate": 0.0006702970297029704, "loss": 2.0383, "step": 5730 }, { "epoch": 4.18, "grad_norm": 0.5515163540840149, "learning_rate": 0.0006693069306930693, "loss": 2.0377, "step": 5740 }, { "epoch": 4.19, "grad_norm": 0.6445341110229492, "learning_rate": 0.0006683168316831684, "loss": 2.0348, "step": 5750 }, { "epoch": 4.2, "grad_norm": 0.48192110657691956, "learning_rate": 0.0006673267326732674, "loss": 2.0376, "step": 5760 }, { "epoch": 4.2, "eval_accuracy": 0.5963143559474492, "eval_loss": 1.8983112573623657, "eval_runtime": 1086.0935, "eval_samples_per_second": 459.795, "eval_steps_per_second": 2.053, "step": 5760 }, { "epoch": 4.2, "grad_norm": 0.43177658319473267, "learning_rate": 0.0006663366336633664, "loss": 2.0304, "step": 5770 }, { "epoch": 4.21, "grad_norm": 0.5673606395721436, "learning_rate": 0.0006653465346534654, "loss": 2.0342, "step": 5780 }, { "epoch": 4.22, "grad_norm": 0.6636048555374146, "learning_rate": 0.0006643564356435644, "loss": 2.035, "step": 5790 }, { "epoch": 4.23, "grad_norm": 0.5259466171264648, "learning_rate": 0.0006633663366336634, "loss": 2.0323, "step": 5800 }, { "epoch": 4.23, "grad_norm": 0.561341404914856, "learning_rate": 0.0006623762376237625, "loss": 2.0277, "step": 5810 }, { "epoch": 4.24, "grad_norm": 0.48961034417152405, "learning_rate": 0.0006613861386138614, "loss": 2.0316, "step": 5820 }, { "epoch": 4.25, "grad_norm": 0.46912887692451477, "learning_rate": 0.0006603960396039605, "loss": 2.0285, "step": 5830 }, { "epoch": 4.26, "grad_norm": 0.5009626746177673, "learning_rate": 0.0006594059405940594, "loss": 2.0287, "step": 5840 }, { "epoch": 4.26, "grad_norm": 0.5485634207725525, "learning_rate": 0.0006584158415841585, "loss": 2.0275, "step": 5850 }, { "epoch": 4.26, "eval_accuracy": 0.5978907811550054, "eval_loss": 1.8894693851470947, "eval_runtime": 1085.9104, "eval_samples_per_second": 459.872, "eval_steps_per_second": 2.054, "step": 5850 }, { "epoch": 4.27, "grad_norm": 0.5094584822654724, "learning_rate": 0.0006574257425742575, "loss": 2.0285, "step": 5860 }, { "epoch": 4.28, "grad_norm": 0.5796740651130676, "learning_rate": 0.0006564356435643565, "loss": 2.0246, "step": 5870 }, { "epoch": 4.28, "grad_norm": 0.5473222136497498, "learning_rate": 0.0006554455445544555, "loss": 2.0262, "step": 5880 }, { "epoch": 4.29, "grad_norm": 0.514639675617218, "learning_rate": 0.0006544554455445545, "loss": 2.0196, "step": 5890 }, { "epoch": 4.3, "grad_norm": 0.7184515595436096, "learning_rate": 0.0006534653465346535, "loss": 2.0228, "step": 5900 }, { "epoch": 4.31, "grad_norm": 0.5729575157165527, "learning_rate": 0.0006524752475247526, "loss": 2.0216, "step": 5910 }, { "epoch": 4.31, "grad_norm": 0.543946385383606, "learning_rate": 0.0006514851485148515, "loss": 2.0221, "step": 5920 }, { "epoch": 4.32, "grad_norm": 0.5328618884086609, "learning_rate": 0.0006504950495049506, "loss": 2.0205, "step": 5930 }, { "epoch": 4.33, "grad_norm": 0.45202726125717163, "learning_rate": 0.0006495049504950495, "loss": 2.0171, "step": 5940 }, { "epoch": 4.33, "eval_accuracy": 0.5995061137091588, "eval_loss": 1.880002498626709, "eval_runtime": 1087.0114, "eval_samples_per_second": 459.406, "eval_steps_per_second": 2.051, "step": 5940 }, { "epoch": 4.34, "grad_norm": 0.6767913103103638, "learning_rate": 0.0006485148514851485, "loss": 2.0196, "step": 5950 }, { "epoch": 4.34, "grad_norm": 0.6758072972297668, "learning_rate": 0.0006475247524752476, "loss": 2.0187, "step": 5960 }, { "epoch": 4.35, "grad_norm": 0.49338895082473755, "learning_rate": 0.0006465346534653465, "loss": 2.0203, "step": 5970 }, { "epoch": 4.36, "grad_norm": 0.6283939480781555, "learning_rate": 0.0006455445544554456, "loss": 2.0141, "step": 5980 }, { "epoch": 4.36, "grad_norm": 0.4420575499534607, "learning_rate": 0.0006445544554455445, "loss": 2.0169, "step": 5990 }, { "epoch": 4.37, "grad_norm": 0.5496084690093994, "learning_rate": 0.0006435643564356436, "loss": 2.0132, "step": 6000 }, { "epoch": 4.38, "grad_norm": 0.5455350279808044, "learning_rate": 0.0006425742574257426, "loss": 2.0135, "step": 6010 }, { "epoch": 4.39, "grad_norm": 0.5139738917350769, "learning_rate": 0.0006415841584158416, "loss": 2.0165, "step": 6020 }, { "epoch": 4.39, "grad_norm": 0.5023753046989441, "learning_rate": 0.0006405940594059406, "loss": 2.0107, "step": 6030 }, { "epoch": 4.39, "eval_accuracy": 0.6004914418538849, "eval_loss": 1.8729732036590576, "eval_runtime": 1086.4568, "eval_samples_per_second": 459.641, "eval_steps_per_second": 2.053, "step": 6030 }, { "epoch": 4.4, "grad_norm": 0.50832599401474, "learning_rate": 0.0006396039603960396, "loss": 2.0125, "step": 6040 }, { "epoch": 4.41, "grad_norm": 0.6144891977310181, "learning_rate": 0.0006386138613861386, "loss": 2.0074, "step": 6050 }, { "epoch": 4.42, "grad_norm": 0.5852723121643066, "learning_rate": 0.0006376237623762377, "loss": 2.0116, "step": 6060 }, { "epoch": 4.42, "grad_norm": 0.6694257259368896, "learning_rate": 0.0006366336633663366, "loss": 2.0068, "step": 6070 }, { "epoch": 4.43, "grad_norm": 0.5024294257164001, "learning_rate": 0.0006356435643564357, "loss": 2.0097, "step": 6080 }, { "epoch": 4.44, "grad_norm": 0.4397622048854828, "learning_rate": 0.0006346534653465346, "loss": 2.0058, "step": 6090 }, { "epoch": 4.44, "grad_norm": 0.46592214703559875, "learning_rate": 0.0006336633663366337, "loss": 2.0036, "step": 6100 }, { "epoch": 4.45, "grad_norm": 0.6728220582008362, "learning_rate": 0.0006326732673267327, "loss": 2.0027, "step": 6110 }, { "epoch": 4.46, "grad_norm": 0.5555120706558228, "learning_rate": 0.0006316831683168317, "loss": 2.0037, "step": 6120 }, { "epoch": 4.46, "eval_accuracy": 0.6017631778318724, "eval_loss": 1.8681055307388306, "eval_runtime": 1084.7583, "eval_samples_per_second": 460.361, "eval_steps_per_second": 2.056, "step": 6120 }, { "epoch": 4.47, "grad_norm": 0.5361539721488953, "learning_rate": 0.0006306930693069307, "loss": 2.0029, "step": 6130 }, { "epoch": 4.47, "grad_norm": 0.4616907238960266, "learning_rate": 0.0006297029702970297, "loss": 2.004, "step": 6140 }, { "epoch": 4.48, "grad_norm": 0.4987693727016449, "learning_rate": 0.0006287128712871287, "loss": 2.0023, "step": 6150 }, { "epoch": 4.49, "grad_norm": 0.5090926289558411, "learning_rate": 0.0006277227722772278, "loss": 1.9998, "step": 6160 }, { "epoch": 4.5, "grad_norm": 0.5008625388145447, "learning_rate": 0.0006267326732673267, "loss": 1.9983, "step": 6170 }, { "epoch": 4.5, "grad_norm": 0.5074314475059509, "learning_rate": 0.0006257425742574258, "loss": 1.9971, "step": 6180 }, { "epoch": 4.51, "grad_norm": 0.5803602933883667, "learning_rate": 0.0006247524752475247, "loss": 1.9974, "step": 6190 }, { "epoch": 4.52, "grad_norm": 0.534377932548523, "learning_rate": 0.0006237623762376238, "loss": 1.9962, "step": 6200 }, { "epoch": 4.52, "grad_norm": 0.5166971683502197, "learning_rate": 0.0006227722772277228, "loss": 1.9967, "step": 6210 }, { "epoch": 4.52, "eval_accuracy": 0.6030721722960667, "eval_loss": 1.8595592975616455, "eval_runtime": 1083.857, "eval_samples_per_second": 460.743, "eval_steps_per_second": 2.057, "step": 6210 }, { "epoch": 4.53, "grad_norm": 0.610471785068512, "learning_rate": 0.0006217821782178218, "loss": 1.9972, "step": 6220 }, { "epoch": 4.54, "grad_norm": 0.4919542372226715, "learning_rate": 0.0006207920792079208, "loss": 1.9945, "step": 6230 }, { "epoch": 4.55, "grad_norm": 0.6607844829559326, "learning_rate": 0.0006198019801980198, "loss": 1.9962, "step": 6240 }, { "epoch": 4.55, "grad_norm": 0.4905446171760559, "learning_rate": 0.0006188118811881188, "loss": 1.9937, "step": 6250 }, { "epoch": 4.56, "grad_norm": 0.46790874004364014, "learning_rate": 0.0006178217821782179, "loss": 1.99, "step": 6260 }, { "epoch": 4.57, "grad_norm": 0.5997541546821594, "learning_rate": 0.0006168316831683168, "loss": 1.9928, "step": 6270 }, { "epoch": 4.58, "grad_norm": 0.5869884490966797, "learning_rate": 0.0006158415841584159, "loss": 1.9913, "step": 6280 }, { "epoch": 4.58, "grad_norm": 0.5359517335891724, "learning_rate": 0.0006148514851485148, "loss": 1.9892, "step": 6290 }, { "epoch": 4.59, "grad_norm": 0.5119579434394836, "learning_rate": 0.0006138613861386139, "loss": 1.9892, "step": 6300 }, { "epoch": 4.59, "eval_accuracy": 0.6040549869719135, "eval_loss": 1.8538638353347778, "eval_runtime": 1084.8597, "eval_samples_per_second": 460.318, "eval_steps_per_second": 2.056, "step": 6300 }, { "epoch": 4.6, "grad_norm": 0.501132071018219, "learning_rate": 0.0006128712871287129, "loss": 1.9874, "step": 6310 }, { "epoch": 4.6, "grad_norm": 0.5507422089576721, "learning_rate": 0.0006118811881188119, "loss": 1.9904, "step": 6320 }, { "epoch": 4.61, "grad_norm": 0.5412635207176208, "learning_rate": 0.0006108910891089109, "loss": 1.9873, "step": 6330 }, { "epoch": 4.62, "grad_norm": 0.44309449195861816, "learning_rate": 0.0006099009900990099, "loss": 1.9869, "step": 6340 }, { "epoch": 4.63, "grad_norm": 0.5056418776512146, "learning_rate": 0.0006089108910891089, "loss": 1.9855, "step": 6350 }, { "epoch": 4.63, "grad_norm": 0.48882943391799927, "learning_rate": 0.000607920792079208, "loss": 1.984, "step": 6360 }, { "epoch": 4.64, "grad_norm": 0.4717276692390442, "learning_rate": 0.0006069306930693069, "loss": 1.9841, "step": 6370 }, { "epoch": 4.65, "grad_norm": 0.5123859643936157, "learning_rate": 0.000605940594059406, "loss": 1.9847, "step": 6380 }, { "epoch": 4.66, "grad_norm": 0.4882528483867645, "learning_rate": 0.0006049504950495049, "loss": 1.9824, "step": 6390 }, { "epoch": 4.66, "eval_accuracy": 0.6054229902100914, "eval_loss": 1.8454294204711914, "eval_runtime": 1084.8895, "eval_samples_per_second": 460.305, "eval_steps_per_second": 2.056, "step": 6390 }, { "epoch": 4.66, "grad_norm": 0.4354398548603058, "learning_rate": 0.000603960396039604, "loss": 1.9806, "step": 6400 }, { "epoch": 4.67, "grad_norm": 0.5733498334884644, "learning_rate": 0.000602970297029703, "loss": 1.9828, "step": 6410 }, { "epoch": 4.68, "grad_norm": 0.5736912488937378, "learning_rate": 0.000601980198019802, "loss": 1.9816, "step": 6420 }, { "epoch": 4.69, "grad_norm": 0.44418302178382874, "learning_rate": 0.000600990099009901, "loss": 1.978, "step": 6430 }, { "epoch": 4.69, "grad_norm": 0.5334004759788513, "learning_rate": 0.0006, "loss": 1.9814, "step": 6440 }, { "epoch": 4.7, "grad_norm": 0.5706362128257751, "learning_rate": 0.000599009900990099, "loss": 1.9787, "step": 6450 }, { "epoch": 4.71, "grad_norm": 0.8315806984901428, "learning_rate": 0.000598019801980198, "loss": 1.9772, "step": 6460 }, { "epoch": 4.71, "grad_norm": 0.47482118010520935, "learning_rate": 0.000597029702970297, "loss": 1.98, "step": 6470 }, { "epoch": 4.72, "grad_norm": 0.43432384729385376, "learning_rate": 0.000596039603960396, "loss": 1.9766, "step": 6480 }, { "epoch": 4.72, "eval_accuracy": 0.6066947766959975, "eval_loss": 1.8382798433303833, "eval_runtime": 1084.0208, "eval_samples_per_second": 460.674, "eval_steps_per_second": 2.057, "step": 6480 }, { "epoch": 4.73, "grad_norm": 0.4684685170650482, "learning_rate": 0.000595049504950495, "loss": 1.9747, "step": 6490 }, { "epoch": 4.74, "grad_norm": 0.5347057580947876, "learning_rate": 0.000594059405940594, "loss": 1.9773, "step": 6500 }, { "epoch": 4.74, "grad_norm": 0.5178421139717102, "learning_rate": 0.0005930693069306931, "loss": 1.9754, "step": 6510 }, { "epoch": 4.75, "grad_norm": 0.5036115050315857, "learning_rate": 0.000592079207920792, "loss": 1.9737, "step": 6520 }, { "epoch": 4.76, "grad_norm": 0.48273569345474243, "learning_rate": 0.0005910891089108911, "loss": 1.9723, "step": 6530 }, { "epoch": 4.77, "grad_norm": 0.4350590109825134, "learning_rate": 0.00059009900990099, "loss": 1.9715, "step": 6540 }, { "epoch": 4.77, "grad_norm": 0.4483092129230499, "learning_rate": 0.0005891089108910891, "loss": 1.9718, "step": 6550 }, { "epoch": 4.78, "grad_norm": 0.6620519757270813, "learning_rate": 0.0005881188118811881, "loss": 1.9726, "step": 6560 }, { "epoch": 4.79, "grad_norm": 0.41184690594673157, "learning_rate": 0.0005871287128712871, "loss": 1.9682, "step": 6570 }, { "epoch": 4.79, "eval_accuracy": 0.6078188605164281, "eval_loss": 1.8335860967636108, "eval_runtime": 1085.8051, "eval_samples_per_second": 459.917, "eval_steps_per_second": 2.054, "step": 6570 }, { "epoch": 4.79, "grad_norm": 0.6032079458236694, "learning_rate": 0.0005861386138613861, "loss": 1.9701, "step": 6580 }, { "epoch": 4.8, "grad_norm": 0.5113199949264526, "learning_rate": 0.0005851485148514851, "loss": 1.9715, "step": 6590 }, { "epoch": 4.81, "grad_norm": 0.43198567628860474, "learning_rate": 0.0005841584158415841, "loss": 1.9677, "step": 6600 }, { "epoch": 4.82, "grad_norm": 0.511009931564331, "learning_rate": 0.0005831683168316832, "loss": 1.9692, "step": 6610 }, { "epoch": 4.82, "grad_norm": 0.48394373059272766, "learning_rate": 0.0005821782178217821, "loss": 1.9658, "step": 6620 }, { "epoch": 4.83, "grad_norm": 0.5180623531341553, "learning_rate": 0.0005811881188118812, "loss": 1.9656, "step": 6630 }, { "epoch": 4.84, "grad_norm": 0.5282729864120483, "learning_rate": 0.0005801980198019801, "loss": 1.9672, "step": 6640 }, { "epoch": 4.85, "grad_norm": 0.44400766491889954, "learning_rate": 0.0005792079207920792, "loss": 1.9628, "step": 6650 }, { "epoch": 4.85, "grad_norm": 0.47033068537712097, "learning_rate": 0.0005782178217821782, "loss": 1.9653, "step": 6660 }, { "epoch": 4.85, "eval_accuracy": 0.6084373992178661, "eval_loss": 1.8281679153442383, "eval_runtime": 1084.5844, "eval_samples_per_second": 460.434, "eval_steps_per_second": 2.056, "step": 6660 }, { "epoch": 4.86, "grad_norm": 0.7458497881889343, "learning_rate": 0.0005772277227722772, "loss": 1.9662, "step": 6670 }, { "epoch": 4.87, "grad_norm": 0.4841892421245575, "learning_rate": 0.0005762376237623762, "loss": 1.9645, "step": 6680 }, { "epoch": 4.87, "grad_norm": 0.472526490688324, "learning_rate": 0.0005752475247524752, "loss": 1.9621, "step": 6690 }, { "epoch": 4.88, "grad_norm": 0.41672539710998535, "learning_rate": 0.0005742574257425742, "loss": 1.9595, "step": 6700 }, { "epoch": 4.89, "grad_norm": 0.5024113059043884, "learning_rate": 0.0005732673267326733, "loss": 1.9593, "step": 6710 }, { "epoch": 4.9, "grad_norm": 0.5023587942123413, "learning_rate": 0.0005722772277227722, "loss": 1.9599, "step": 6720 }, { "epoch": 4.9, "grad_norm": 0.45752909779548645, "learning_rate": 0.0005712871287128713, "loss": 1.9599, "step": 6730 }, { "epoch": 4.91, "grad_norm": 0.6170557737350464, "learning_rate": 0.0005702970297029702, "loss": 1.9616, "step": 6740 }, { "epoch": 4.92, "grad_norm": 0.4267810583114624, "learning_rate": 0.0005693069306930693, "loss": 1.9599, "step": 6750 }, { "epoch": 4.92, "eval_accuracy": 0.609472718679915, "eval_loss": 1.8216131925582886, "eval_runtime": 1085.0974, "eval_samples_per_second": 460.217, "eval_steps_per_second": 2.055, "step": 6750 }, { "epoch": 4.93, "grad_norm": 0.5765691995620728, "learning_rate": 0.0005683168316831683, "loss": 1.9571, "step": 6760 }, { "epoch": 4.93, "grad_norm": 0.5143380761146545, "learning_rate": 0.0005673267326732673, "loss": 1.9594, "step": 6770 }, { "epoch": 4.94, "grad_norm": 0.47696933150291443, "learning_rate": 0.0005663366336633663, "loss": 1.9569, "step": 6780 }, { "epoch": 4.95, "grad_norm": 0.6473893523216248, "learning_rate": 0.0005653465346534653, "loss": 1.959, "step": 6790 }, { "epoch": 4.95, "grad_norm": 0.42045238614082336, "learning_rate": 0.0005643564356435643, "loss": 1.9537, "step": 6800 }, { "epoch": 4.96, "grad_norm": 0.47495463490486145, "learning_rate": 0.0005633663366336634, "loss": 1.9539, "step": 6810 }, { "epoch": 4.97, "grad_norm": 0.46555668115615845, "learning_rate": 0.0005623762376237624, "loss": 1.9532, "step": 6820 }, { "epoch": 4.98, "grad_norm": 0.5669355988502502, "learning_rate": 0.0005613861386138615, "loss": 1.9539, "step": 6830 }, { "epoch": 4.98, "grad_norm": 0.5012803077697754, "learning_rate": 0.0005603960396039604, "loss": 1.9516, "step": 6840 }, { "epoch": 4.98, "eval_accuracy": 0.6106878911706901, "eval_loss": 1.8161377906799316, "eval_runtime": 1086.8974, "eval_samples_per_second": 459.455, "eval_steps_per_second": 2.052, "step": 6840 }, { "epoch": 4.99, "grad_norm": 0.6031624674797058, "learning_rate": 0.0005594059405940595, "loss": 1.951, "step": 6850 }, { "epoch": 5.0, "grad_norm": 0.48754164576530457, "learning_rate": 0.0005584158415841585, "loss": 1.9512, "step": 6860 }, { "epoch": 5.01, "grad_norm": 0.5661942362785339, "learning_rate": 0.0005574257425742575, "loss": 1.953, "step": 6870 }, { "epoch": 5.01, "grad_norm": 0.5200914144515991, "learning_rate": 0.0005564356435643565, "loss": 1.9528, "step": 6880 }, { "epoch": 5.02, "grad_norm": 0.504625678062439, "learning_rate": 0.0005554455445544555, "loss": 1.9473, "step": 6890 }, { "epoch": 5.03, "grad_norm": 0.5114207863807678, "learning_rate": 0.0005544554455445545, "loss": 1.9503, "step": 6900 }, { "epoch": 5.03, "grad_norm": 0.4563724100589752, "learning_rate": 0.0005534653465346536, "loss": 1.9522, "step": 6910 }, { "epoch": 5.04, "grad_norm": 0.48981210589408875, "learning_rate": 0.0005524752475247525, "loss": 1.9463, "step": 6920 }, { "epoch": 5.05, "grad_norm": 0.4641856849193573, "learning_rate": 0.0005514851485148516, "loss": 1.9473, "step": 6930 }, { "epoch": 5.05, "eval_accuracy": 0.6114587633382502, "eval_loss": 1.8128423690795898, "eval_runtime": 1088.997, "eval_samples_per_second": 458.569, "eval_steps_per_second": 2.048, "step": 6930 }, { "epoch": 5.06, "grad_norm": 0.522405743598938, "learning_rate": 0.0005504950495049505, "loss": 1.9464, "step": 6940 }, { "epoch": 5.06, "grad_norm": 0.5010780692100525, "learning_rate": 0.0005495049504950496, "loss": 1.9462, "step": 6950 }, { "epoch": 5.07, "grad_norm": 0.4186078906059265, "learning_rate": 0.0005485148514851486, "loss": 1.9454, "step": 6960 }, { "epoch": 5.08, "grad_norm": 0.43226584792137146, "learning_rate": 0.0005475247524752476, "loss": 1.9429, "step": 6970 }, { "epoch": 5.09, "grad_norm": 0.4429096579551697, "learning_rate": 0.0005465346534653466, "loss": 1.9439, "step": 6980 }, { "epoch": 5.09, "grad_norm": 0.5576241612434387, "learning_rate": 0.0005455445544554456, "loss": 1.9465, "step": 6990 }, { "epoch": 5.1, "grad_norm": 0.5840058326721191, "learning_rate": 0.0005445544554455446, "loss": 1.9465, "step": 7000 }, { "epoch": 5.11, "grad_norm": 0.42570897936820984, "learning_rate": 0.0005435643564356437, "loss": 1.9423, "step": 7010 }, { "epoch": 5.11, "grad_norm": 0.4703156650066376, "learning_rate": 0.0005425742574257426, "loss": 1.9445, "step": 7020 }, { "epoch": 5.11, "eval_accuracy": 0.6122824776592447, "eval_loss": 1.8064905405044556, "eval_runtime": 1089.2916, "eval_samples_per_second": 458.445, "eval_steps_per_second": 2.047, "step": 7020 }, { "epoch": 5.12, "grad_norm": 0.47701114416122437, "learning_rate": 0.0005415841584158417, "loss": 1.9414, "step": 7030 }, { "epoch": 5.13, "grad_norm": 0.4756263196468353, "learning_rate": 0.0005405940594059406, "loss": 1.9412, "step": 7040 }, { "epoch": 5.14, "grad_norm": 0.4438433051109314, "learning_rate": 0.0005396039603960396, "loss": 1.9396, "step": 7050 }, { "epoch": 5.14, "grad_norm": 0.7217634916305542, "learning_rate": 0.0005386138613861387, "loss": 1.9405, "step": 7060 }, { "epoch": 5.15, "grad_norm": 0.5862283110618591, "learning_rate": 0.0005376237623762376, "loss": 1.9408, "step": 7070 }, { "epoch": 5.16, "grad_norm": 0.4042336642742157, "learning_rate": 0.0005366336633663367, "loss": 1.9389, "step": 7080 }, { "epoch": 5.17, "grad_norm": 0.48928365111351013, "learning_rate": 0.0005356435643564356, "loss": 1.9387, "step": 7090 }, { "epoch": 5.17, "grad_norm": 0.4354238212108612, "learning_rate": 0.0005346534653465347, "loss": 1.9383, "step": 7100 }, { "epoch": 5.18, "grad_norm": 0.4175672233104706, "learning_rate": 0.0005336633663366337, "loss": 1.9352, "step": 7110 }, { "epoch": 5.18, "eval_accuracy": 0.6133205485630702, "eval_loss": 1.8010112047195435, "eval_runtime": 1088.7719, "eval_samples_per_second": 458.664, "eval_steps_per_second": 2.048, "step": 7110 }, { "epoch": 5.19, "grad_norm": 0.49512535333633423, "learning_rate": 0.0005326732673267327, "loss": 1.936, "step": 7120 }, { "epoch": 5.2, "grad_norm": 0.5564088225364685, "learning_rate": 0.0005316831683168317, "loss": 1.9352, "step": 7130 }, { "epoch": 5.2, "grad_norm": 0.42951545119285583, "learning_rate": 0.0005306930693069307, "loss": 1.9382, "step": 7140 }, { "epoch": 5.21, "grad_norm": 0.4925052523612976, "learning_rate": 0.0005297029702970297, "loss": 1.9332, "step": 7150 }, { "epoch": 5.22, "grad_norm": 0.47808635234832764, "learning_rate": 0.0005287128712871288, "loss": 1.9347, "step": 7160 }, { "epoch": 5.22, "grad_norm": 0.49086272716522217, "learning_rate": 0.0005277227722772277, "loss": 1.9346, "step": 7170 }, { "epoch": 5.23, "grad_norm": 0.6410700678825378, "learning_rate": 0.0005267326732673268, "loss": 1.9355, "step": 7180 }, { "epoch": 5.24, "grad_norm": 0.5207043886184692, "learning_rate": 0.0005257425742574257, "loss": 1.9362, "step": 7190 }, { "epoch": 5.25, "grad_norm": 0.4774588346481323, "learning_rate": 0.0005247524752475248, "loss": 1.9275, "step": 7200 }, { "epoch": 5.25, "eval_accuracy": 0.6140941851085354, "eval_loss": 1.795212745666504, "eval_runtime": 1086.7491, "eval_samples_per_second": 459.517, "eval_steps_per_second": 2.052, "step": 7200 }, { "epoch": 5.25, "grad_norm": 0.43084409832954407, "learning_rate": 0.0005237623762376238, "loss": 1.9323, "step": 7210 }, { "epoch": 5.26, "grad_norm": 0.47473257780075073, "learning_rate": 0.0005227722772277228, "loss": 1.9325, "step": 7220 }, { "epoch": 5.27, "grad_norm": 0.5835340023040771, "learning_rate": 0.0005217821782178218, "loss": 1.9307, "step": 7230 }, { "epoch": 5.28, "grad_norm": 0.5061103701591492, "learning_rate": 0.0005207920792079208, "loss": 1.9312, "step": 7240 }, { "epoch": 5.28, "grad_norm": 0.41326335072517395, "learning_rate": 0.0005198019801980198, "loss": 1.9302, "step": 7250 }, { "epoch": 5.29, "grad_norm": 0.4128727316856384, "learning_rate": 0.0005188118811881189, "loss": 1.9306, "step": 7260 }, { "epoch": 5.3, "grad_norm": 0.4514748752117157, "learning_rate": 0.0005178217821782178, "loss": 1.9306, "step": 7270 }, { "epoch": 5.3, "grad_norm": 0.4459412097930908, "learning_rate": 0.0005168316831683169, "loss": 1.9273, "step": 7280 }, { "epoch": 5.31, "grad_norm": 0.508747935295105, "learning_rate": 0.0005158415841584158, "loss": 1.9288, "step": 7290 }, { "epoch": 5.31, "eval_accuracy": 0.6147504784530803, "eval_loss": 1.7935823202133179, "eval_runtime": 1086.9403, "eval_samples_per_second": 459.436, "eval_steps_per_second": 2.052, "step": 7290 }, { "epoch": 5.32, "grad_norm": 0.47734731435775757, "learning_rate": 0.0005148514851485149, "loss": 1.9298, "step": 7300 }, { "epoch": 5.33, "grad_norm": 0.5174552798271179, "learning_rate": 0.0005138613861386139, "loss": 1.9265, "step": 7310 }, { "epoch": 5.33, "grad_norm": 0.5077902674674988, "learning_rate": 0.0005128712871287129, "loss": 1.9261, "step": 7320 }, { "epoch": 5.34, "grad_norm": 0.5650092959403992, "learning_rate": 0.0005118811881188119, "loss": 1.9298, "step": 7330 }, { "epoch": 5.35, "grad_norm": 0.4757165312767029, "learning_rate": 0.0005108910891089109, "loss": 1.924, "step": 7340 }, { "epoch": 5.36, "grad_norm": 0.39661648869514465, "learning_rate": 0.0005099009900990099, "loss": 1.9246, "step": 7350 }, { "epoch": 5.36, "grad_norm": 0.4992424547672272, "learning_rate": 0.000508910891089109, "loss": 1.9238, "step": 7360 }, { "epoch": 5.37, "grad_norm": 0.5065789222717285, "learning_rate": 0.0005079207920792079, "loss": 1.9227, "step": 7370 }, { "epoch": 5.38, "grad_norm": 0.4040853977203369, "learning_rate": 0.000506930693069307, "loss": 1.9216, "step": 7380 }, { "epoch": 5.38, "eval_accuracy": 0.6157344716957146, "eval_loss": 1.7872822284698486, "eval_runtime": 1088.1718, "eval_samples_per_second": 458.917, "eval_steps_per_second": 2.049, "step": 7380 }, { "epoch": 5.38, "grad_norm": 0.5322463512420654, "learning_rate": 0.0005059405940594059, "loss": 1.9224, "step": 7390 }, { "epoch": 5.39, "grad_norm": 0.7337666153907776, "learning_rate": 0.000504950495049505, "loss": 1.9215, "step": 7400 }, { "epoch": 5.4, "grad_norm": 0.4274744987487793, "learning_rate": 0.000503960396039604, "loss": 1.9204, "step": 7410 }, { "epoch": 5.41, "grad_norm": 0.5131354928016663, "learning_rate": 0.000502970297029703, "loss": 1.9232, "step": 7420 }, { "epoch": 5.41, "grad_norm": 0.46376702189445496, "learning_rate": 0.000501980198019802, "loss": 1.9206, "step": 7430 }, { "epoch": 5.42, "grad_norm": 0.4123290181159973, "learning_rate": 0.000500990099009901, "loss": 1.9195, "step": 7440 }, { "epoch": 5.43, "grad_norm": 0.5006473660469055, "learning_rate": 0.0005, "loss": 1.9193, "step": 7450 }, { "epoch": 5.44, "grad_norm": 0.4529099464416504, "learning_rate": 0.0004990099009900991, "loss": 1.9194, "step": 7460 }, { "epoch": 5.44, "grad_norm": 0.43744367361068726, "learning_rate": 0.000498019801980198, "loss": 1.9215, "step": 7470 }, { "epoch": 5.44, "eval_accuracy": 0.6164003583042955, "eval_loss": 1.7831112146377563, "eval_runtime": 1087.85, "eval_samples_per_second": 459.052, "eval_steps_per_second": 2.05, "step": 7470 }, { "epoch": 5.45, "grad_norm": 0.4329184591770172, "learning_rate": 0.0004970297029702971, "loss": 1.9184, "step": 7480 }, { "epoch": 5.46, "grad_norm": 0.4381932020187378, "learning_rate": 0.000496039603960396, "loss": 1.9194, "step": 7490 }, { "epoch": 5.46, "grad_norm": 0.4533543586730957, "learning_rate": 0.0004950495049504951, "loss": 1.9163, "step": 7500 }, { "epoch": 5.47, "grad_norm": 0.50531405210495, "learning_rate": 0.0004940594059405941, "loss": 1.9196, "step": 7510 }, { "epoch": 5.48, "grad_norm": 0.45237472653388977, "learning_rate": 0.000493069306930693, "loss": 1.9148, "step": 7520 }, { "epoch": 5.49, "grad_norm": 0.4136326313018799, "learning_rate": 0.0004920792079207921, "loss": 1.9152, "step": 7530 }, { "epoch": 5.49, "grad_norm": 0.5655802488327026, "learning_rate": 0.000491089108910891, "loss": 1.9162, "step": 7540 }, { "epoch": 5.5, "grad_norm": 0.4693652391433716, "learning_rate": 0.0004900990099009901, "loss": 1.9172, "step": 7550 }, { "epoch": 5.51, "grad_norm": 0.43454521894454956, "learning_rate": 0.0004891089108910892, "loss": 1.9143, "step": 7560 }, { "epoch": 5.51, "eval_accuracy": 0.6173679783746377, "eval_loss": 1.7775607109069824, "eval_runtime": 1086.5491, "eval_samples_per_second": 459.602, "eval_steps_per_second": 2.052, "step": 7560 }, { "epoch": 5.52, "grad_norm": 0.5520709156990051, "learning_rate": 0.0004881188118811881, "loss": 1.9129, "step": 7570 }, { "epoch": 5.52, "grad_norm": 0.5848320722579956, "learning_rate": 0.00048712871287128715, "loss": 1.9157, "step": 7580 }, { "epoch": 5.53, "grad_norm": 0.4443696141242981, "learning_rate": 0.00048613861386138615, "loss": 1.9129, "step": 7590 }, { "epoch": 5.54, "grad_norm": 0.4639554023742676, "learning_rate": 0.00048514851485148515, "loss": 1.9118, "step": 7600 }, { "epoch": 5.54, "grad_norm": 0.4965130090713501, "learning_rate": 0.00048415841584158414, "loss": 1.911, "step": 7610 }, { "epoch": 5.55, "grad_norm": 0.5710552334785461, "learning_rate": 0.00048316831683168314, "loss": 1.9133, "step": 7620 }, { "epoch": 5.56, "grad_norm": 0.5551273226737976, "learning_rate": 0.0004821782178217822, "loss": 1.9115, "step": 7630 }, { "epoch": 5.57, "grad_norm": 0.4237355887889862, "learning_rate": 0.0004811881188118812, "loss": 1.9107, "step": 7640 }, { "epoch": 5.57, "grad_norm": 0.5999632477760315, "learning_rate": 0.0004801980198019802, "loss": 1.9125, "step": 7650 }, { "epoch": 5.57, "eval_accuracy": 0.6181112629292015, "eval_loss": 1.7737356424331665, "eval_runtime": 1089.6084, "eval_samples_per_second": 458.311, "eval_steps_per_second": 2.047, "step": 7650 }, { "epoch": 5.58, "grad_norm": 0.4806526303291321, "learning_rate": 0.0004792079207920792, "loss": 1.9102, "step": 7660 }, { "epoch": 5.59, "grad_norm": 0.4812794327735901, "learning_rate": 0.0004782178217821782, "loss": 1.9071, "step": 7670 }, { "epoch": 5.6, "grad_norm": 0.4029771089553833, "learning_rate": 0.00047722772277227724, "loss": 1.9081, "step": 7680 }, { "epoch": 5.6, "grad_norm": 0.40626466274261475, "learning_rate": 0.00047623762376237624, "loss": 1.9113, "step": 7690 }, { "epoch": 5.61, "grad_norm": 0.4513317942619324, "learning_rate": 0.00047524752475247524, "loss": 1.9088, "step": 7700 }, { "epoch": 5.62, "grad_norm": 0.4461658000946045, "learning_rate": 0.00047425742574257423, "loss": 1.9054, "step": 7710 }, { "epoch": 5.62, "grad_norm": 0.4517589807510376, "learning_rate": 0.00047326732673267323, "loss": 1.9082, "step": 7720 }, { "epoch": 5.63, "grad_norm": 0.45047953724861145, "learning_rate": 0.0004722772277227723, "loss": 1.9064, "step": 7730 }, { "epoch": 5.64, "grad_norm": 0.4856921434402466, "learning_rate": 0.0004712871287128713, "loss": 1.9075, "step": 7740 }, { "epoch": 5.64, "eval_accuracy": 0.6187679819070717, "eval_loss": 1.7702995538711548, "eval_runtime": 1084.6666, "eval_samples_per_second": 460.4, "eval_steps_per_second": 2.056, "step": 7740 }, { "epoch": 5.65, "grad_norm": 0.4753153920173645, "learning_rate": 0.0004702970297029703, "loss": 1.9063, "step": 7750 }, { "epoch": 5.65, "grad_norm": 0.45308151841163635, "learning_rate": 0.0004693069306930693, "loss": 1.9077, "step": 7760 }, { "epoch": 5.66, "grad_norm": 0.4464881718158722, "learning_rate": 0.00046831683168316833, "loss": 1.9038, "step": 7770 }, { "epoch": 5.67, "grad_norm": 0.49109694361686707, "learning_rate": 0.0004673267326732674, "loss": 1.9041, "step": 7780 }, { "epoch": 5.68, "grad_norm": 0.45286795496940613, "learning_rate": 0.0004663366336633664, "loss": 1.9028, "step": 7790 }, { "epoch": 5.68, "grad_norm": 0.4543616771697998, "learning_rate": 0.0004653465346534654, "loss": 1.9055, "step": 7800 }, { "epoch": 5.69, "grad_norm": 0.45107975602149963, "learning_rate": 0.0004643564356435644, "loss": 1.905, "step": 7810 }, { "epoch": 5.7, "grad_norm": 0.4717351198196411, "learning_rate": 0.0004633663366336634, "loss": 1.9035, "step": 7820 }, { "epoch": 5.71, "grad_norm": 0.47570666670799255, "learning_rate": 0.00046237623762376243, "loss": 1.9023, "step": 7830 }, { "epoch": 5.71, "eval_accuracy": 0.619487519380987, "eval_loss": 1.764754295349121, "eval_runtime": 1085.4383, "eval_samples_per_second": 460.072, "eval_steps_per_second": 2.054, "step": 7830 }, { "epoch": 5.71, "grad_norm": 0.4219855070114136, "learning_rate": 0.00046138613861386143, "loss": 1.9023, "step": 7840 }, { "epoch": 5.72, "grad_norm": 0.43669965863227844, "learning_rate": 0.0004603960396039604, "loss": 1.9027, "step": 7850 }, { "epoch": 5.73, "grad_norm": 0.3779612183570862, "learning_rate": 0.0004594059405940594, "loss": 1.8983, "step": 7860 }, { "epoch": 5.73, "grad_norm": 0.42528143525123596, "learning_rate": 0.0004584158415841584, "loss": 1.9003, "step": 7870 }, { "epoch": 5.74, "grad_norm": 0.5401535630226135, "learning_rate": 0.0004574257425742575, "loss": 1.902, "step": 7880 }, { "epoch": 5.75, "grad_norm": 0.5040842890739441, "learning_rate": 0.00045643564356435647, "loss": 1.898, "step": 7890 }, { "epoch": 5.76, "grad_norm": 0.40838822722435, "learning_rate": 0.00045544554455445547, "loss": 1.9009, "step": 7900 }, { "epoch": 5.76, "grad_norm": 0.4518304169178009, "learning_rate": 0.00045445544554455447, "loss": 1.8971, "step": 7910 }, { "epoch": 5.77, "grad_norm": 0.4375505745410919, "learning_rate": 0.00045346534653465347, "loss": 1.8968, "step": 7920 }, { "epoch": 5.77, "eval_accuracy": 0.6199796521907708, "eval_loss": 1.7632313966751099, "eval_runtime": 1087.3022, "eval_samples_per_second": 459.284, "eval_steps_per_second": 2.051, "step": 7920 }, { "epoch": 5.78, "grad_norm": 0.4348089098930359, "learning_rate": 0.0004524752475247525, "loss": 1.897, "step": 7930 }, { "epoch": 5.79, "grad_norm": 0.4852411150932312, "learning_rate": 0.0004514851485148515, "loss": 1.8979, "step": 7940 }, { "epoch": 5.79, "grad_norm": 0.3691408932209015, "learning_rate": 0.0004504950495049505, "loss": 1.8952, "step": 7950 }, { "epoch": 5.8, "grad_norm": 0.40795382857322693, "learning_rate": 0.0004495049504950495, "loss": 1.8976, "step": 7960 }, { "epoch": 5.81, "grad_norm": 0.5768758058547974, "learning_rate": 0.0004485148514851485, "loss": 1.9002, "step": 7970 }, { "epoch": 5.81, "grad_norm": 0.47147759795188904, "learning_rate": 0.00044752475247524756, "loss": 1.8962, "step": 7980 }, { "epoch": 5.82, "grad_norm": 0.4033481478691101, "learning_rate": 0.00044653465346534656, "loss": 1.8902, "step": 7990 }, { "epoch": 5.83, "grad_norm": 0.474514365196228, "learning_rate": 0.00044554455445544556, "loss": 1.8914, "step": 8000 }, { "epoch": 5.84, "grad_norm": 0.4343509376049042, "learning_rate": 0.00044455445544554456, "loss": 1.8909, "step": 8010 }, { "epoch": 5.84, "eval_accuracy": 0.6212259728987508, "eval_loss": 1.755420207977295, "eval_runtime": 1084.9224, "eval_samples_per_second": 460.291, "eval_steps_per_second": 2.055, "step": 8010 }, { "epoch": 5.84, "grad_norm": 0.435395210981369, "learning_rate": 0.00044356435643564356, "loss": 1.8907, "step": 8020 }, { "epoch": 5.85, "grad_norm": 0.48715683817863464, "learning_rate": 0.0004425742574257426, "loss": 1.8894, "step": 8030 }, { "epoch": 5.86, "grad_norm": 0.4001710116863251, "learning_rate": 0.0004415841584158416, "loss": 1.89, "step": 8040 }, { "epoch": 5.87, "grad_norm": 0.38079318404197693, "learning_rate": 0.0004405940594059406, "loss": 1.8893, "step": 8050 }, { "epoch": 5.87, "grad_norm": 0.5229191780090332, "learning_rate": 0.0004396039603960396, "loss": 1.8916, "step": 8060 }, { "epoch": 5.88, "grad_norm": 0.4705289900302887, "learning_rate": 0.0004386138613861386, "loss": 1.891, "step": 8070 }, { "epoch": 5.89, "grad_norm": 0.4324556589126587, "learning_rate": 0.00043762376237623765, "loss": 1.8872, "step": 8080 }, { "epoch": 5.89, "grad_norm": 0.4106965959072113, "learning_rate": 0.00043663366336633665, "loss": 1.8861, "step": 8090 }, { "epoch": 5.9, "grad_norm": 0.461008220911026, "learning_rate": 0.00043564356435643565, "loss": 1.8903, "step": 8100 }, { "epoch": 5.9, "eval_accuracy": 0.6217799147539793, "eval_loss": 1.751676082611084, "eval_runtime": 1084.7892, "eval_samples_per_second": 460.348, "eval_steps_per_second": 2.056, "step": 8100 }, { "epoch": 5.91, "grad_norm": 0.4489499032497406, "learning_rate": 0.00043465346534653465, "loss": 1.8894, "step": 8110 }, { "epoch": 5.92, "grad_norm": 0.40970441699028015, "learning_rate": 0.00043366336633663365, "loss": 1.8858, "step": 8120 }, { "epoch": 5.92, "grad_norm": 0.397197425365448, "learning_rate": 0.0004326732673267327, "loss": 1.8866, "step": 8130 }, { "epoch": 5.93, "grad_norm": 0.4046621322631836, "learning_rate": 0.0004316831683168317, "loss": 1.8865, "step": 8140 }, { "epoch": 5.94, "grad_norm": 0.4488195478916168, "learning_rate": 0.0004306930693069307, "loss": 1.8898, "step": 8150 }, { "epoch": 5.95, "grad_norm": 0.37480929493904114, "learning_rate": 0.0004297029702970297, "loss": 1.883, "step": 8160 }, { "epoch": 5.95, "grad_norm": 0.39327338337898254, "learning_rate": 0.0004287128712871287, "loss": 1.8837, "step": 8170 }, { "epoch": 5.96, "grad_norm": 0.4443519115447998, "learning_rate": 0.00042772277227722774, "loss": 1.8825, "step": 8180 }, { "epoch": 5.97, "grad_norm": 0.4902293384075165, "learning_rate": 0.00042673267326732674, "loss": 1.8837, "step": 8190 }, { "epoch": 5.97, "eval_accuracy": 0.622874142241673, "eval_loss": 1.7469381093978882, "eval_runtime": 1082.2695, "eval_samples_per_second": 461.419, "eval_steps_per_second": 2.06, "step": 8190 }, { "epoch": 5.97, "grad_norm": 0.43381059169769287, "learning_rate": 0.00042574257425742574, "loss": 1.8812, "step": 8200 }, { "epoch": 5.98, "grad_norm": 0.4675629734992981, "learning_rate": 0.00042475247524752474, "loss": 1.8808, "step": 8210 }, { "epoch": 5.99, "grad_norm": 0.4100710451602936, "learning_rate": 0.00042376237623762374, "loss": 1.882, "step": 8220 }, { "epoch": 6.0, "grad_norm": 0.5555077791213989, "learning_rate": 0.0004227722772277228, "loss": 1.8824, "step": 8230 }, { "epoch": 6.0, "grad_norm": 0.5407902598381042, "learning_rate": 0.0004217821782178218, "loss": 1.8813, "step": 8240 }, { "epoch": 6.01, "grad_norm": 0.48739898204803467, "learning_rate": 0.0004207920792079208, "loss": 1.8782, "step": 8250 }, { "epoch": 6.02, "grad_norm": 0.4977239966392517, "learning_rate": 0.0004198019801980198, "loss": 1.8801, "step": 8260 }, { "epoch": 6.03, "grad_norm": 0.47402721643447876, "learning_rate": 0.0004188118811881188, "loss": 1.8811, "step": 8270 }, { "epoch": 6.03, "grad_norm": 0.4796026349067688, "learning_rate": 0.00041782178217821784, "loss": 1.8801, "step": 8280 }, { "epoch": 6.03, "eval_accuracy": 0.6238451571943063, "eval_loss": 1.741037130355835, "eval_runtime": 1087.5002, "eval_samples_per_second": 459.2, "eval_steps_per_second": 2.051, "step": 8280 }, { "epoch": 6.04, "grad_norm": 0.419162392616272, "learning_rate": 0.00041683168316831683, "loss": 1.8796, "step": 8290 }, { "epoch": 6.05, "grad_norm": 0.409493625164032, "learning_rate": 0.00041584158415841583, "loss": 1.8794, "step": 8300 }, { "epoch": 6.05, "grad_norm": 0.5155593752861023, "learning_rate": 0.00041485148514851483, "loss": 1.8786, "step": 8310 }, { "epoch": 6.06, "grad_norm": 0.46643194556236267, "learning_rate": 0.00041386138613861383, "loss": 1.8786, "step": 8320 }, { "epoch": 6.07, "grad_norm": 0.4528968930244446, "learning_rate": 0.0004128712871287129, "loss": 1.8765, "step": 8330 }, { "epoch": 6.08, "grad_norm": 0.4780101180076599, "learning_rate": 0.0004118811881188119, "loss": 1.8742, "step": 8340 }, { "epoch": 6.08, "grad_norm": 0.41187387704849243, "learning_rate": 0.0004108910891089109, "loss": 1.8754, "step": 8350 }, { "epoch": 6.09, "grad_norm": 0.48807311058044434, "learning_rate": 0.0004099009900990099, "loss": 1.877, "step": 8360 }, { "epoch": 6.1, "grad_norm": 0.4205549657344818, "learning_rate": 0.0004089108910891089, "loss": 1.8769, "step": 8370 }, { "epoch": 6.1, "eval_accuracy": 0.6247575321496361, "eval_loss": 1.7364323139190674, "eval_runtime": 1085.7284, "eval_samples_per_second": 459.949, "eval_steps_per_second": 2.054, "step": 8370 }, { "epoch": 6.11, "grad_norm": 0.5422940850257874, "learning_rate": 0.0004079207920792079, "loss": 1.8743, "step": 8380 }, { "epoch": 6.11, "grad_norm": 0.4061022400856018, "learning_rate": 0.0004069306930693069, "loss": 1.8748, "step": 8390 }, { "epoch": 6.12, "grad_norm": 0.4819527268409729, "learning_rate": 0.000405940594059406, "loss": 1.8734, "step": 8400 }, { "epoch": 6.13, "grad_norm": 0.4938518702983856, "learning_rate": 0.000404950495049505, "loss": 1.8746, "step": 8410 }, { "epoch": 6.13, "grad_norm": 0.39021047949790955, "learning_rate": 0.00040396039603960397, "loss": 1.874, "step": 8420 }, { "epoch": 6.14, "grad_norm": 0.3815496265888214, "learning_rate": 0.000402970297029703, "loss": 1.8722, "step": 8430 }, { "epoch": 6.15, "grad_norm": 0.4226173758506775, "learning_rate": 0.000401980198019802, "loss": 1.8716, "step": 8440 }, { "epoch": 6.16, "grad_norm": 0.4311840236186981, "learning_rate": 0.000400990099009901, "loss": 1.8729, "step": 8450 }, { "epoch": 6.16, "grad_norm": 0.37644141912460327, "learning_rate": 0.0004, "loss": 1.8697, "step": 8460 }, { "epoch": 6.16, "eval_accuracy": 0.6258217191475034, "eval_loss": 1.7313834428787231, "eval_runtime": 1087.4157, "eval_samples_per_second": 459.236, "eval_steps_per_second": 2.051, "step": 8460 }, { "epoch": 6.17, "grad_norm": 0.34123849868774414, "learning_rate": 0.000399009900990099, "loss": 1.8709, "step": 8470 }, { "epoch": 6.18, "grad_norm": 0.5545117259025574, "learning_rate": 0.00039801980198019807, "loss": 1.8729, "step": 8480 }, { "epoch": 6.19, "grad_norm": 0.44565874338150024, "learning_rate": 0.00039702970297029707, "loss": 1.8716, "step": 8490 }, { "epoch": 6.19, "grad_norm": 0.4301845133304596, "learning_rate": 0.00039603960396039607, "loss": 1.866, "step": 8500 }, { "epoch": 6.2, "grad_norm": 0.46877652406692505, "learning_rate": 0.00039504950495049506, "loss": 1.8672, "step": 8510 }, { "epoch": 6.21, "grad_norm": 0.535370945930481, "learning_rate": 0.00039405940594059406, "loss": 1.8712, "step": 8520 }, { "epoch": 6.22, "grad_norm": 0.39393341541290283, "learning_rate": 0.0003930693069306931, "loss": 1.8671, "step": 8530 }, { "epoch": 6.22, "grad_norm": 0.36698782444000244, "learning_rate": 0.0003920792079207921, "loss": 1.8685, "step": 8540 }, { "epoch": 6.23, "grad_norm": 0.4744206666946411, "learning_rate": 0.0003910891089108911, "loss": 1.8673, "step": 8550 }, { "epoch": 6.23, "eval_accuracy": 0.6265433755799307, "eval_loss": 1.7275755405426025, "eval_runtime": 1085.596, "eval_samples_per_second": 460.005, "eval_steps_per_second": 2.054, "step": 8550 }, { "epoch": 6.24, "grad_norm": 0.4614261984825134, "learning_rate": 0.0003900990099009901, "loss": 1.8631, "step": 8560 }, { "epoch": 6.24, "grad_norm": 0.4780130386352539, "learning_rate": 0.0003891089108910891, "loss": 1.8658, "step": 8570 }, { "epoch": 6.25, "grad_norm": 0.4078359007835388, "learning_rate": 0.00038811881188118816, "loss": 1.8644, "step": 8580 }, { "epoch": 6.26, "grad_norm": 0.4950817823410034, "learning_rate": 0.00038712871287128716, "loss": 1.8631, "step": 8590 }, { "epoch": 6.27, "grad_norm": 0.4473728835582733, "learning_rate": 0.00038613861386138616, "loss": 1.862, "step": 8600 }, { "epoch": 6.27, "grad_norm": 0.41996896266937256, "learning_rate": 0.00038514851485148515, "loss": 1.8609, "step": 8610 }, { "epoch": 6.28, "grad_norm": 0.4896424114704132, "learning_rate": 0.00038415841584158415, "loss": 1.8639, "step": 8620 }, { "epoch": 6.29, "grad_norm": 0.5288357138633728, "learning_rate": 0.0003831683168316832, "loss": 1.8629, "step": 8630 }, { "epoch": 6.3, "grad_norm": 0.414982408285141, "learning_rate": 0.0003821782178217822, "loss": 1.8611, "step": 8640 }, { "epoch": 6.3, "eval_accuracy": 0.6280281595524545, "eval_loss": 1.7198432683944702, "eval_runtime": 1085.3535, "eval_samples_per_second": 460.108, "eval_steps_per_second": 2.055, "step": 8640 }, { "epoch": 6.3, "grad_norm": 0.45578229427337646, "learning_rate": 0.0003811881188118812, "loss": 1.8598, "step": 8650 }, { "epoch": 6.31, "grad_norm": 0.4705806374549866, "learning_rate": 0.0003801980198019802, "loss": 1.8606, "step": 8660 }, { "epoch": 6.32, "grad_norm": 0.40844404697418213, "learning_rate": 0.0003792079207920792, "loss": 1.8597, "step": 8670 }, { "epoch": 6.32, "grad_norm": 0.3853258490562439, "learning_rate": 0.00037821782178217825, "loss": 1.8582, "step": 8680 }, { "epoch": 6.33, "grad_norm": 0.4357406198978424, "learning_rate": 0.00037722772277227725, "loss": 1.8595, "step": 8690 }, { "epoch": 6.34, "grad_norm": 0.5262021422386169, "learning_rate": 0.00037623762376237625, "loss": 1.858, "step": 8700 }, { "epoch": 6.35, "grad_norm": 0.5264425873756409, "learning_rate": 0.00037524752475247524, "loss": 1.8587, "step": 8710 }, { "epoch": 6.35, "grad_norm": 0.45019951462745667, "learning_rate": 0.00037425742574257424, "loss": 1.8566, "step": 8720 }, { "epoch": 6.36, "grad_norm": 0.41189640760421753, "learning_rate": 0.0003732673267326733, "loss": 1.8572, "step": 8730 }, { "epoch": 6.36, "eval_accuracy": 0.6290112373666048, "eval_loss": 1.7162350416183472, "eval_runtime": 1085.09, "eval_samples_per_second": 460.22, "eval_steps_per_second": 2.055, "step": 8730 }, { "epoch": 6.37, "grad_norm": 0.4982648491859436, "learning_rate": 0.0003722772277227723, "loss": 1.8546, "step": 8740 }, { "epoch": 6.38, "grad_norm": 0.3992471694946289, "learning_rate": 0.0003712871287128713, "loss": 1.8537, "step": 8750 }, { "epoch": 6.38, "grad_norm": 0.46979019045829773, "learning_rate": 0.0003702970297029703, "loss": 1.8566, "step": 8760 }, { "epoch": 6.39, "grad_norm": 0.5312979817390442, "learning_rate": 0.0003693069306930693, "loss": 1.8528, "step": 8770 }, { "epoch": 6.4, "grad_norm": 0.4439733624458313, "learning_rate": 0.00036831683168316834, "loss": 1.855, "step": 8780 }, { "epoch": 6.4, "grad_norm": 0.39169180393218994, "learning_rate": 0.00036732673267326734, "loss": 1.8528, "step": 8790 }, { "epoch": 6.41, "grad_norm": 0.41544708609580994, "learning_rate": 0.00036633663366336634, "loss": 1.8508, "step": 8800 }, { "epoch": 6.42, "grad_norm": 0.4738489091396332, "learning_rate": 0.00036534653465346533, "loss": 1.8515, "step": 8810 }, { "epoch": 6.43, "grad_norm": 0.4519752860069275, "learning_rate": 0.00036435643564356433, "loss": 1.8507, "step": 8820 }, { "epoch": 6.43, "eval_accuracy": 0.6307912179674288, "eval_loss": 1.707141399383545, "eval_runtime": 1087.4605, "eval_samples_per_second": 459.217, "eval_steps_per_second": 2.051, "step": 8820 }, { "epoch": 6.43, "grad_norm": 0.382656991481781, "learning_rate": 0.0003633663366336634, "loss": 1.8487, "step": 8830 }, { "epoch": 6.44, "grad_norm": 0.4599255919456482, "learning_rate": 0.0003623762376237624, "loss": 1.8506, "step": 8840 }, { "epoch": 6.45, "grad_norm": 0.4658602476119995, "learning_rate": 0.0003613861386138614, "loss": 1.8496, "step": 8850 }, { "epoch": 6.46, "grad_norm": 0.43937841057777405, "learning_rate": 0.0003603960396039604, "loss": 1.8504, "step": 8860 }, { "epoch": 6.46, "grad_norm": 0.4001791477203369, "learning_rate": 0.0003594059405940594, "loss": 1.8479, "step": 8870 }, { "epoch": 6.47, "grad_norm": 0.47235003113746643, "learning_rate": 0.00035841584158415843, "loss": 1.8462, "step": 8880 }, { "epoch": 6.48, "grad_norm": 0.42399758100509644, "learning_rate": 0.00035742574257425743, "loss": 1.8483, "step": 8890 }, { "epoch": 6.48, "grad_norm": 0.3666219413280487, "learning_rate": 0.0003564356435643564, "loss": 1.8462, "step": 8900 }, { "epoch": 6.49, "grad_norm": 0.4140106737613678, "learning_rate": 0.0003554455445544554, "loss": 1.8447, "step": 8910 }, { "epoch": 6.49, "eval_accuracy": 0.6324917716108477, "eval_loss": 1.6985867023468018, "eval_runtime": 1087.0314, "eval_samples_per_second": 459.398, "eval_steps_per_second": 2.051, "step": 8910 }, { "epoch": 6.5, "grad_norm": 0.4904990792274475, "learning_rate": 0.0003544554455445544, "loss": 1.8451, "step": 8920 }, { "epoch": 6.51, "grad_norm": 0.43653395771980286, "learning_rate": 0.0003534653465346535, "loss": 1.8434, "step": 8930 }, { "epoch": 6.51, "grad_norm": 0.4815143942832947, "learning_rate": 0.0003524752475247525, "loss": 1.8405, "step": 8940 }, { "epoch": 6.52, "grad_norm": 0.5665289759635925, "learning_rate": 0.00035148514851485147, "loss": 1.8426, "step": 8950 }, { "epoch": 6.53, "grad_norm": 0.45166435837745667, "learning_rate": 0.00035049504950495047, "loss": 1.8431, "step": 8960 }, { "epoch": 6.54, "grad_norm": 0.45748040080070496, "learning_rate": 0.00034950495049504947, "loss": 1.8434, "step": 8970 }, { "epoch": 6.54, "grad_norm": 0.42432862520217896, "learning_rate": 0.0003485148514851485, "loss": 1.8408, "step": 8980 }, { "epoch": 6.55, "grad_norm": 0.3683374226093292, "learning_rate": 0.0003475247524752475, "loss": 1.8388, "step": 8990 }, { "epoch": 6.56, "grad_norm": 0.4106079041957855, "learning_rate": 0.0003465346534653465, "loss": 1.839, "step": 9000 }, { "epoch": 6.56, "eval_accuracy": 0.6338676239837059, "eval_loss": 1.6945050954818726, "eval_runtime": 1084.8256, "eval_samples_per_second": 460.332, "eval_steps_per_second": 2.056, "step": 9000 }, { "epoch": 6.56, "grad_norm": 0.46325406432151794, "learning_rate": 0.0003455445544554455, "loss": 1.8383, "step": 9010 }, { "epoch": 6.57, "grad_norm": 0.4024347960948944, "learning_rate": 0.0003445544554455445, "loss": 1.8397, "step": 9020 }, { "epoch": 6.58, "grad_norm": 0.5055080652236938, "learning_rate": 0.0003435643564356436, "loss": 1.8386, "step": 9030 }, { "epoch": 6.59, "grad_norm": 0.38540130853652954, "learning_rate": 0.0003425742574257426, "loss": 1.8381, "step": 9040 }, { "epoch": 6.59, "grad_norm": 0.42219218611717224, "learning_rate": 0.0003415841584158416, "loss": 1.8379, "step": 9050 }, { "epoch": 6.6, "grad_norm": 0.37353622913360596, "learning_rate": 0.0003405940594059406, "loss": 1.8354, "step": 9060 }, { "epoch": 6.61, "grad_norm": 0.4756326973438263, "learning_rate": 0.0003396039603960396, "loss": 1.8345, "step": 9070 }, { "epoch": 6.62, "grad_norm": 0.44085556268692017, "learning_rate": 0.00033861386138613867, "loss": 1.8355, "step": 9080 }, { "epoch": 6.62, "grad_norm": 0.4287554919719696, "learning_rate": 0.00033762376237623766, "loss": 1.8345, "step": 9090 }, { "epoch": 6.62, "eval_accuracy": 0.6351918568609488, "eval_loss": 1.6874170303344727, "eval_runtime": 1085.5989, "eval_samples_per_second": 460.004, "eval_steps_per_second": 2.054, "step": 9090 }, { "epoch": 6.63, "grad_norm": 0.4309207499027252, "learning_rate": 0.00033663366336633666, "loss": 1.8336, "step": 9100 }, { "epoch": 6.64, "grad_norm": 0.41971975564956665, "learning_rate": 0.00033564356435643566, "loss": 1.83, "step": 9110 }, { "epoch": 6.64, "grad_norm": 0.4418039619922638, "learning_rate": 0.00033465346534653466, "loss": 1.829, "step": 9120 }, { "epoch": 6.65, "grad_norm": 0.44311702251434326, "learning_rate": 0.0003336633663366337, "loss": 1.8337, "step": 9130 }, { "epoch": 6.66, "grad_norm": 0.46464502811431885, "learning_rate": 0.0003326732673267327, "loss": 1.835, "step": 9140 }, { "epoch": 6.67, "grad_norm": 0.40370500087738037, "learning_rate": 0.0003316831683168317, "loss": 1.8304, "step": 9150 }, { "epoch": 6.67, "grad_norm": 0.37608906626701355, "learning_rate": 0.0003306930693069307, "loss": 1.8312, "step": 9160 }, { "epoch": 6.68, "grad_norm": 0.3990706503391266, "learning_rate": 0.0003297029702970297, "loss": 1.8313, "step": 9170 }, { "epoch": 6.69, "grad_norm": 0.5069761872291565, "learning_rate": 0.00032871287128712876, "loss": 1.8292, "step": 9180 }, { "epoch": 6.69, "eval_accuracy": 0.6364002620555558, "eval_loss": 1.6848387718200684, "eval_runtime": 1085.7505, "eval_samples_per_second": 459.94, "eval_steps_per_second": 2.054, "step": 9180 }, { "epoch": 6.7, "grad_norm": 0.450114905834198, "learning_rate": 0.00032772277227722775, "loss": 1.8279, "step": 9190 }, { "epoch": 6.7, "grad_norm": 0.4129829704761505, "learning_rate": 0.00032673267326732675, "loss": 1.8282, "step": 9200 }, { "epoch": 6.71, "grad_norm": 0.4841521084308624, "learning_rate": 0.00032574257425742575, "loss": 1.8282, "step": 9210 }, { "epoch": 6.72, "grad_norm": 0.5160727500915527, "learning_rate": 0.00032475247524752475, "loss": 1.8269, "step": 9220 }, { "epoch": 6.73, "grad_norm": 0.5048640370368958, "learning_rate": 0.0003237623762376238, "loss": 1.8268, "step": 9230 }, { "epoch": 6.73, "grad_norm": 0.584237813949585, "learning_rate": 0.0003227722772277228, "loss": 1.8295, "step": 9240 }, { "epoch": 6.74, "grad_norm": 0.43098556995391846, "learning_rate": 0.0003217821782178218, "loss": 1.8249, "step": 9250 }, { "epoch": 6.75, "grad_norm": 0.3972054421901703, "learning_rate": 0.0003207920792079208, "loss": 1.8246, "step": 9260 }, { "epoch": 6.75, "grad_norm": 0.3777482211589813, "learning_rate": 0.0003198019801980198, "loss": 1.8243, "step": 9270 }, { "epoch": 6.75, "eval_accuracy": 0.6379164581645916, "eval_loss": 1.6772228479385376, "eval_runtime": 1085.6894, "eval_samples_per_second": 459.966, "eval_steps_per_second": 2.054, "step": 9270 }, { "epoch": 6.76, "grad_norm": 0.42633891105651855, "learning_rate": 0.00031881188118811885, "loss": 1.8221, "step": 9280 }, { "epoch": 6.77, "grad_norm": 0.5893245339393616, "learning_rate": 0.00031782178217821784, "loss": 1.8243, "step": 9290 }, { "epoch": 6.78, "grad_norm": 0.5304878354072571, "learning_rate": 0.00031683168316831684, "loss": 1.8242, "step": 9300 }, { "epoch": 6.78, "grad_norm": 0.4657045304775238, "learning_rate": 0.00031584158415841584, "loss": 1.8242, "step": 9310 }, { "epoch": 6.79, "grad_norm": 0.43276962637901306, "learning_rate": 0.00031485148514851484, "loss": 1.8225, "step": 9320 }, { "epoch": 6.8, "grad_norm": 0.40515634417533875, "learning_rate": 0.0003138613861386139, "loss": 1.8228, "step": 9330 }, { "epoch": 6.81, "grad_norm": 0.40448087453842163, "learning_rate": 0.0003128712871287129, "loss": 1.8198, "step": 9340 }, { "epoch": 6.81, "grad_norm": 0.4380359351634979, "learning_rate": 0.0003118811881188119, "loss": 1.8201, "step": 9350 }, { "epoch": 6.82, "grad_norm": 0.46539106965065, "learning_rate": 0.0003108910891089109, "loss": 1.8217, "step": 9360 }, { "epoch": 6.82, "eval_accuracy": 0.6389200973153136, "eval_loss": 1.6709976196289062, "eval_runtime": 1085.3788, "eval_samples_per_second": 460.097, "eval_steps_per_second": 2.055, "step": 9360 }, { "epoch": 6.83, "grad_norm": 0.4772910177707672, "learning_rate": 0.0003099009900990099, "loss": 1.8202, "step": 9370 }, { "epoch": 6.83, "grad_norm": 0.42799142003059387, "learning_rate": 0.00030891089108910894, "loss": 1.819, "step": 9380 }, { "epoch": 6.84, "grad_norm": 0.40562522411346436, "learning_rate": 0.00030792079207920793, "loss": 1.8208, "step": 9390 }, { "epoch": 6.85, "grad_norm": 0.6129370927810669, "learning_rate": 0.00030693069306930693, "loss": 1.8158, "step": 9400 }, { "epoch": 6.86, "grad_norm": 0.4654337465763092, "learning_rate": 0.00030594059405940593, "loss": 1.8175, "step": 9410 }, { "epoch": 6.86, "grad_norm": 0.4340899884700775, "learning_rate": 0.00030495049504950493, "loss": 1.8156, "step": 9420 }, { "epoch": 6.87, "grad_norm": 0.4216438829898834, "learning_rate": 0.000303960396039604, "loss": 1.8182, "step": 9430 }, { "epoch": 6.88, "grad_norm": 0.3995111584663391, "learning_rate": 0.000302970297029703, "loss": 1.8175, "step": 9440 }, { "epoch": 6.89, "grad_norm": 0.4166571795940399, "learning_rate": 0.000301980198019802, "loss": 1.8171, "step": 9450 }, { "epoch": 6.89, "eval_accuracy": 0.6397325747268708, "eval_loss": 1.6698857545852661, "eval_runtime": 1086.4276, "eval_samples_per_second": 459.653, "eval_steps_per_second": 2.053, "step": 9450 }, { "epoch": 6.89, "grad_norm": 0.4397519528865814, "learning_rate": 0.000300990099009901, "loss": 1.8171, "step": 9460 }, { "epoch": 6.9, "grad_norm": 0.4201519787311554, "learning_rate": 0.0003, "loss": 1.8166, "step": 9470 }, { "epoch": 6.91, "grad_norm": 0.4194183945655823, "learning_rate": 0.000299009900990099, "loss": 1.8137, "step": 9480 }, { "epoch": 6.91, "grad_norm": 0.4156358540058136, "learning_rate": 0.000298019801980198, "loss": 1.8149, "step": 9490 }, { "epoch": 6.92, "grad_norm": 0.43356573581695557, "learning_rate": 0.000297029702970297, "loss": 1.8104, "step": 9500 }, { "epoch": 6.93, "grad_norm": 0.4354686141014099, "learning_rate": 0.000296039603960396, "loss": 1.8131, "step": 9510 }, { "epoch": 6.94, "grad_norm": 0.45513659715652466, "learning_rate": 0.000295049504950495, "loss": 1.8115, "step": 9520 }, { "epoch": 6.94, "grad_norm": 0.39433979988098145, "learning_rate": 0.00029405940594059407, "loss": 1.8106, "step": 9530 }, { "epoch": 6.95, "grad_norm": 0.5176064372062683, "learning_rate": 0.00029306930693069307, "loss": 1.8153, "step": 9540 }, { "epoch": 6.95, "eval_accuracy": 0.640691022778346, "eval_loss": 1.66335129737854, "eval_runtime": 1082.6564, "eval_samples_per_second": 461.254, "eval_steps_per_second": 2.06, "step": 9540 }, { "epoch": 6.96, "grad_norm": 0.5153635740280151, "learning_rate": 0.00029207920792079207, "loss": 1.8142, "step": 9550 }, { "epoch": 6.97, "grad_norm": 0.37104055285453796, "learning_rate": 0.00029108910891089107, "loss": 1.812, "step": 9560 }, { "epoch": 6.97, "grad_norm": 0.40426042675971985, "learning_rate": 0.00029009900990099006, "loss": 1.8119, "step": 9570 }, { "epoch": 6.98, "grad_norm": 0.5108228325843811, "learning_rate": 0.0002891089108910891, "loss": 1.8131, "step": 9580 }, { "epoch": 6.99, "grad_norm": 0.4702747166156769, "learning_rate": 0.0002881188118811881, "loss": 1.812, "step": 9590 }, { "epoch": 6.99, "grad_norm": 0.3683488965034485, "learning_rate": 0.0002871287128712871, "loss": 1.8081, "step": 9600 }, { "epoch": 7.0, "grad_norm": 0.42281776666641235, "learning_rate": 0.0002861386138613861, "loss": 1.81, "step": 9610 }, { "epoch": 7.01, "grad_norm": 0.48128095269203186, "learning_rate": 0.0002851485148514851, "loss": 1.81, "step": 9620 }, { "epoch": 7.02, "grad_norm": 0.4020933210849762, "learning_rate": 0.00028415841584158416, "loss": 1.81, "step": 9630 }, { "epoch": 7.02, "eval_accuracy": 0.6416495776980271, "eval_loss": 1.6598803997039795, "eval_runtime": 1084.1267, "eval_samples_per_second": 460.629, "eval_steps_per_second": 2.057, "step": 9630 }, { "epoch": 7.02, "grad_norm": 0.5061802864074707, "learning_rate": 0.00028316831683168316, "loss": 1.8091, "step": 9640 }, { "epoch": 7.03, "grad_norm": 0.5182695388793945, "learning_rate": 0.00028217821782178216, "loss": 1.8101, "step": 9650 }, { "epoch": 7.04, "grad_norm": 0.45669490098953247, "learning_rate": 0.0002811881188118812, "loss": 1.8059, "step": 9660 }, { "epoch": 7.05, "grad_norm": 0.5214717984199524, "learning_rate": 0.0002801980198019802, "loss": 1.809, "step": 9670 }, { "epoch": 7.05, "grad_norm": 0.46073251962661743, "learning_rate": 0.00027920792079207926, "loss": 1.8066, "step": 9680 }, { "epoch": 7.06, "grad_norm": 0.3924192190170288, "learning_rate": 0.00027821782178217826, "loss": 1.8058, "step": 9690 }, { "epoch": 7.07, "grad_norm": 0.42634785175323486, "learning_rate": 0.00027722772277227726, "loss": 1.8082, "step": 9700 }, { "epoch": 7.07, "grad_norm": 0.46675705909729004, "learning_rate": 0.00027623762376237626, "loss": 1.8057, "step": 9710 }, { "epoch": 7.08, "grad_norm": 0.43609708547592163, "learning_rate": 0.00027524752475247525, "loss": 1.8051, "step": 9720 }, { "epoch": 7.08, "eval_accuracy": 0.6424563152815425, "eval_loss": 1.6557390689849854, "eval_runtime": 1083.0526, "eval_samples_per_second": 461.086, "eval_steps_per_second": 2.059, "step": 9720 }, { "epoch": 7.09, "grad_norm": 0.35754847526550293, "learning_rate": 0.0002742574257425743, "loss": 1.8053, "step": 9730 }, { "epoch": 7.1, "grad_norm": 0.38588428497314453, "learning_rate": 0.0002732673267326733, "loss": 1.8059, "step": 9740 }, { "epoch": 7.1, "grad_norm": 0.4815811514854431, "learning_rate": 0.0002722772277227723, "loss": 1.8053, "step": 9750 }, { "epoch": 7.11, "grad_norm": 0.4292014241218567, "learning_rate": 0.0002712871287128713, "loss": 1.8018, "step": 9760 }, { "epoch": 7.12, "grad_norm": 0.4443877339363098, "learning_rate": 0.0002702970297029703, "loss": 1.8022, "step": 9770 }, { "epoch": 7.13, "grad_norm": 0.4887067675590515, "learning_rate": 0.00026930693069306935, "loss": 1.804, "step": 9780 }, { "epoch": 7.13, "grad_norm": 0.4577280282974243, "learning_rate": 0.00026831683168316835, "loss": 1.8043, "step": 9790 }, { "epoch": 7.14, "grad_norm": 0.38725900650024414, "learning_rate": 0.00026732673267326735, "loss": 1.8014, "step": 9800 }, { "epoch": 7.15, "grad_norm": 0.45041412115097046, "learning_rate": 0.00026633663366336635, "loss": 1.8046, "step": 9810 }, { "epoch": 7.15, "eval_accuracy": 0.6431962908340371, "eval_loss": 1.6528569459915161, "eval_runtime": 1084.5222, "eval_samples_per_second": 460.461, "eval_steps_per_second": 2.056, "step": 9810 }, { "epoch": 7.15, "grad_norm": 0.532798707485199, "learning_rate": 0.00026534653465346534, "loss": 1.8015, "step": 9820 }, { "epoch": 7.16, "grad_norm": 0.43234437704086304, "learning_rate": 0.0002643564356435644, "loss": 1.8013, "step": 9830 }, { "epoch": 7.17, "grad_norm": 0.4301891624927521, "learning_rate": 0.0002633663366336634, "loss": 1.8011, "step": 9840 }, { "epoch": 7.18, "grad_norm": 0.41938352584838867, "learning_rate": 0.0002623762376237624, "loss": 1.8032, "step": 9850 }, { "epoch": 7.18, "grad_norm": 0.4415999948978424, "learning_rate": 0.0002613861386138614, "loss": 1.8011, "step": 9860 }, { "epoch": 7.19, "grad_norm": 0.46587106585502625, "learning_rate": 0.0002603960396039604, "loss": 1.8005, "step": 9870 }, { "epoch": 7.2, "grad_norm": 0.3755381107330322, "learning_rate": 0.00025940594059405944, "loss": 1.7996, "step": 9880 }, { "epoch": 7.21, "grad_norm": 0.37551912665367126, "learning_rate": 0.00025841584158415844, "loss": 1.7981, "step": 9890 }, { "epoch": 7.21, "grad_norm": 0.4268946349620819, "learning_rate": 0.00025742574257425744, "loss": 1.7997, "step": 9900 }, { "epoch": 7.21, "eval_accuracy": 0.6438810244549774, "eval_loss": 1.6489626169204712, "eval_runtime": 1085.6377, "eval_samples_per_second": 459.988, "eval_steps_per_second": 2.054, "step": 9900 }, { "epoch": 7.22, "grad_norm": 0.5793518424034119, "learning_rate": 0.00025643564356435644, "loss": 1.8, "step": 9910 }, { "epoch": 7.23, "grad_norm": 0.37436190247535706, "learning_rate": 0.00025544554455445543, "loss": 1.7974, "step": 9920 }, { "epoch": 7.24, "grad_norm": 0.5522225499153137, "learning_rate": 0.0002544554455445545, "loss": 1.7968, "step": 9930 }, { "epoch": 7.24, "grad_norm": 0.4452868402004242, "learning_rate": 0.0002534653465346535, "loss": 1.7997, "step": 9940 }, { "epoch": 7.25, "grad_norm": 0.41211819648742676, "learning_rate": 0.0002524752475247525, "loss": 1.798, "step": 9950 }, { "epoch": 7.26, "grad_norm": 0.4052869379520416, "learning_rate": 0.0002514851485148515, "loss": 1.7948, "step": 9960 }, { "epoch": 7.26, "grad_norm": 0.4514144957065582, "learning_rate": 0.0002504950495049505, "loss": 1.7955, "step": 9970 }, { "epoch": 7.27, "grad_norm": 0.49351832270622253, "learning_rate": 0.00024950495049504953, "loss": 1.7991, "step": 9980 }, { "epoch": 7.28, "grad_norm": 0.49827703833580017, "learning_rate": 0.00024851485148514853, "loss": 1.7994, "step": 9990 }, { "epoch": 7.28, "eval_accuracy": 0.6442858204973027, "eval_loss": 1.6466220617294312, "eval_runtime": 1086.2584, "eval_samples_per_second": 459.725, "eval_steps_per_second": 2.053, "step": 9990 }, { "epoch": 7.29, "grad_norm": 0.4449995160102844, "learning_rate": 0.00024752475247524753, "loss": 1.7962, "step": 10000 }, { "epoch": 7.29, "grad_norm": 0.3683604300022125, "learning_rate": 0.0002465346534653465, "loss": 1.7984, "step": 10010 }, { "epoch": 7.3, "grad_norm": 0.48126864433288574, "learning_rate": 0.0002455445544554455, "loss": 1.7953, "step": 10020 }, { "epoch": 7.31, "grad_norm": 0.3565351068973541, "learning_rate": 0.0002445544554455446, "loss": 1.7914, "step": 10030 }, { "epoch": 7.32, "grad_norm": 0.4369056820869446, "learning_rate": 0.00024356435643564357, "loss": 1.7949, "step": 10040 }, { "epoch": 7.32, "grad_norm": 0.5566734671592712, "learning_rate": 0.00024257425742574257, "loss": 1.7961, "step": 10050 }, { "epoch": 7.33, "grad_norm": 0.4066598117351532, "learning_rate": 0.00024158415841584157, "loss": 1.7962, "step": 10060 }, { "epoch": 7.34, "grad_norm": 0.4281260669231415, "learning_rate": 0.0002405940594059406, "loss": 1.7943, "step": 10070 }, { "epoch": 7.34, "grad_norm": 0.34586983919143677, "learning_rate": 0.0002396039603960396, "loss": 1.7934, "step": 10080 }, { "epoch": 7.34, "eval_accuracy": 0.6448193534487687, "eval_loss": 1.644710898399353, "eval_runtime": 1084.6339, "eval_samples_per_second": 460.413, "eval_steps_per_second": 2.056, "step": 10080 }, { "epoch": 7.35, "grad_norm": 0.3622014820575714, "learning_rate": 0.00023861386138613862, "loss": 1.7912, "step": 10090 }, { "epoch": 7.36, "grad_norm": 0.456106960773468, "learning_rate": 0.00023762376237623762, "loss": 1.7922, "step": 10100 }, { "epoch": 7.37, "grad_norm": 0.4329501986503601, "learning_rate": 0.00023663366336633662, "loss": 1.7937, "step": 10110 }, { "epoch": 7.37, "grad_norm": 0.4640803039073944, "learning_rate": 0.00023564356435643564, "loss": 1.793, "step": 10120 }, { "epoch": 7.38, "grad_norm": 0.39238548278808594, "learning_rate": 0.00023465346534653464, "loss": 1.7916, "step": 10130 }, { "epoch": 7.39, "grad_norm": 0.43311530351638794, "learning_rate": 0.0002336633663366337, "loss": 1.7943, "step": 10140 }, { "epoch": 7.4, "grad_norm": 0.35872432589530945, "learning_rate": 0.0002326732673267327, "loss": 1.7918, "step": 10150 }, { "epoch": 7.4, "grad_norm": 0.36510738730430603, "learning_rate": 0.0002316831683168317, "loss": 1.7907, "step": 10160 }, { "epoch": 7.41, "grad_norm": 0.46106651425361633, "learning_rate": 0.00023069306930693071, "loss": 1.7917, "step": 10170 }, { "epoch": 7.41, "eval_accuracy": 0.6454906168005663, "eval_loss": 1.641427993774414, "eval_runtime": 1084.7154, "eval_samples_per_second": 460.379, "eval_steps_per_second": 2.056, "step": 10170 }, { "epoch": 7.42, "grad_norm": 0.37493211030960083, "learning_rate": 0.0002297029702970297, "loss": 1.7921, "step": 10180 }, { "epoch": 7.42, "grad_norm": 0.3865686058998108, "learning_rate": 0.00022871287128712874, "loss": 1.791, "step": 10190 }, { "epoch": 7.43, "grad_norm": 0.3889116942882538, "learning_rate": 0.00022772277227722774, "loss": 1.7907, "step": 10200 }, { "epoch": 7.44, "grad_norm": 0.6045088768005371, "learning_rate": 0.00022673267326732673, "loss": 1.7902, "step": 10210 }, { "epoch": 7.45, "grad_norm": 0.4185848832130432, "learning_rate": 0.00022574257425742576, "loss": 1.7889, "step": 10220 }, { "epoch": 7.45, "grad_norm": 0.3915616571903229, "learning_rate": 0.00022475247524752476, "loss": 1.7906, "step": 10230 }, { "epoch": 7.46, "grad_norm": 0.3687775433063507, "learning_rate": 0.00022376237623762378, "loss": 1.7897, "step": 10240 }, { "epoch": 7.47, "grad_norm": 0.39049315452575684, "learning_rate": 0.00022277227722772278, "loss": 1.789, "step": 10250 }, { "epoch": 7.48, "grad_norm": 0.37289920449256897, "learning_rate": 0.00022178217821782178, "loss": 1.7887, "step": 10260 }, { "epoch": 7.48, "eval_accuracy": 0.645869271741179, "eval_loss": 1.639427661895752, "eval_runtime": 1118.4989, "eval_samples_per_second": 446.473, "eval_steps_per_second": 1.994, "step": 10260 }, { "epoch": 7.48, "grad_norm": 0.45301392674446106, "learning_rate": 0.0002207920792079208, "loss": 1.7911, "step": 10270 }, { "epoch": 7.49, "grad_norm": 0.42282310128211975, "learning_rate": 0.0002198019801980198, "loss": 1.788, "step": 10280 }, { "epoch": 7.5, "grad_norm": 0.34825190901756287, "learning_rate": 0.00021881188118811883, "loss": 1.7888, "step": 10290 }, { "epoch": 7.5, "grad_norm": 0.4688248038291931, "learning_rate": 0.00021782178217821783, "loss": 1.7878, "step": 10300 }, { "epoch": 7.51, "grad_norm": 0.4295547902584076, "learning_rate": 0.00021683168316831682, "loss": 1.7877, "step": 10310 }, { "epoch": 7.52, "grad_norm": 0.3433161675930023, "learning_rate": 0.00021584158415841585, "loss": 1.7884, "step": 10320 }, { "epoch": 7.53, "grad_norm": 0.34638333320617676, "learning_rate": 0.00021485148514851485, "loss": 1.7891, "step": 10330 }, { "epoch": 7.53, "grad_norm": 0.40277931094169617, "learning_rate": 0.00021386138613861387, "loss": 1.7856, "step": 10340 }, { "epoch": 7.54, "grad_norm": 0.42518341541290283, "learning_rate": 0.00021287128712871287, "loss": 1.7861, "step": 10350 }, { "epoch": 7.54, "eval_accuracy": 0.6466240342094058, "eval_loss": 1.637886881828308, "eval_runtime": 1094.5797, "eval_samples_per_second": 456.23, "eval_steps_per_second": 2.037, "step": 10350 }, { "epoch": 7.55, "grad_norm": 0.3844136893749237, "learning_rate": 0.00021188118811881187, "loss": 1.7862, "step": 10360 }, { "epoch": 7.56, "grad_norm": 0.34440210461616516, "learning_rate": 0.0002108910891089109, "loss": 1.7864, "step": 10370 }, { "epoch": 7.56, "grad_norm": 0.501716136932373, "learning_rate": 0.0002099009900990099, "loss": 1.7843, "step": 10380 }, { "epoch": 7.57, "grad_norm": 0.3695526421070099, "learning_rate": 0.00020891089108910892, "loss": 1.7855, "step": 10390 }, { "epoch": 7.58, "grad_norm": 0.38437628746032715, "learning_rate": 0.00020792079207920792, "loss": 1.7847, "step": 10400 }, { "epoch": 7.58, "grad_norm": 0.4197578430175781, "learning_rate": 0.00020693069306930691, "loss": 1.7833, "step": 10410 }, { "epoch": 7.59, "grad_norm": 0.39093175530433655, "learning_rate": 0.00020594059405940594, "loss": 1.786, "step": 10420 }, { "epoch": 7.6, "grad_norm": 0.36000731587409973, "learning_rate": 0.00020495049504950494, "loss": 1.7872, "step": 10430 }, { "epoch": 7.61, "grad_norm": 0.45473846793174744, "learning_rate": 0.00020396039603960396, "loss": 1.7853, "step": 10440 }, { "epoch": 7.61, "eval_accuracy": 0.6470880404015521, "eval_loss": 1.6340434551239014, "eval_runtime": 1091.9524, "eval_samples_per_second": 457.328, "eval_steps_per_second": 2.042, "step": 10440 }, { "epoch": 7.61, "grad_norm": 0.3739522099494934, "learning_rate": 0.000202970297029703, "loss": 1.7824, "step": 10450 }, { "epoch": 7.62, "grad_norm": 0.40237516164779663, "learning_rate": 0.00020198019801980199, "loss": 1.7837, "step": 10460 }, { "epoch": 7.63, "grad_norm": 0.44717445969581604, "learning_rate": 0.000200990099009901, "loss": 1.7828, "step": 10470 }, { "epoch": 7.64, "grad_norm": 0.4383144676685333, "learning_rate": 0.0002, "loss": 1.7841, "step": 10480 }, { "epoch": 7.64, "grad_norm": 0.368528813123703, "learning_rate": 0.00019900990099009903, "loss": 1.7818, "step": 10490 }, { "epoch": 7.65, "grad_norm": 0.40288105607032776, "learning_rate": 0.00019801980198019803, "loss": 1.7832, "step": 10500 }, { "epoch": 7.66, "grad_norm": 0.3775827884674072, "learning_rate": 0.00019702970297029703, "loss": 1.7814, "step": 10510 }, { "epoch": 7.66, "grad_norm": 0.4067000448703766, "learning_rate": 0.00019603960396039606, "loss": 1.7815, "step": 10520 }, { "epoch": 7.67, "grad_norm": 0.45529502630233765, "learning_rate": 0.00019504950495049505, "loss": 1.7847, "step": 10530 }, { "epoch": 7.67, "eval_accuracy": 0.647348903103301, "eval_loss": 1.6344057321548462, "eval_runtime": 1092.9918, "eval_samples_per_second": 456.893, "eval_steps_per_second": 2.04, "step": 10530 }, { "epoch": 7.68, "grad_norm": 0.4383666217327118, "learning_rate": 0.00019405940594059408, "loss": 1.7799, "step": 10540 }, { "epoch": 7.69, "grad_norm": 0.41071295738220215, "learning_rate": 0.00019306930693069308, "loss": 1.7826, "step": 10550 }, { "epoch": 7.69, "grad_norm": 0.4367865324020386, "learning_rate": 0.00019207920792079208, "loss": 1.7819, "step": 10560 }, { "epoch": 7.7, "grad_norm": 0.3609165549278259, "learning_rate": 0.0001910891089108911, "loss": 1.7802, "step": 10570 }, { "epoch": 7.71, "grad_norm": 0.39005783200263977, "learning_rate": 0.0001900990099009901, "loss": 1.78, "step": 10580 }, { "epoch": 7.72, "grad_norm": 0.3287705183029175, "learning_rate": 0.00018910891089108913, "loss": 1.7797, "step": 10590 }, { "epoch": 7.72, "grad_norm": 0.3266151547431946, "learning_rate": 0.00018811881188118812, "loss": 1.7795, "step": 10600 }, { "epoch": 7.73, "grad_norm": 0.35796740651130676, "learning_rate": 0.00018712871287128712, "loss": 1.7782, "step": 10610 }, { "epoch": 7.74, "grad_norm": 0.39394471049308777, "learning_rate": 0.00018613861386138615, "loss": 1.7799, "step": 10620 }, { "epoch": 7.74, "eval_accuracy": 0.647908722477522, "eval_loss": 1.6299968957901, "eval_runtime": 1086.7095, "eval_samples_per_second": 459.534, "eval_steps_per_second": 2.052, "step": 10620 }, { "epoch": 7.75, "grad_norm": 0.4640734791755676, "learning_rate": 0.00018514851485148514, "loss": 1.7794, "step": 10630 }, { "epoch": 7.75, "grad_norm": 0.3890862464904785, "learning_rate": 0.00018415841584158417, "loss": 1.7794, "step": 10640 }, { "epoch": 7.76, "grad_norm": 0.3503568768501282, "learning_rate": 0.00018316831683168317, "loss": 1.7776, "step": 10650 }, { "epoch": 7.77, "grad_norm": 0.39593997597694397, "learning_rate": 0.00018217821782178217, "loss": 1.7798, "step": 10660 }, { "epoch": 7.77, "grad_norm": 0.3677063286304474, "learning_rate": 0.0001811881188118812, "loss": 1.7769, "step": 10670 }, { "epoch": 7.78, "grad_norm": 0.3558836877346039, "learning_rate": 0.0001801980198019802, "loss": 1.7803, "step": 10680 }, { "epoch": 7.79, "grad_norm": 0.4983728229999542, "learning_rate": 0.00017920792079207922, "loss": 1.7774, "step": 10690 }, { "epoch": 7.8, "grad_norm": 0.3735315203666687, "learning_rate": 0.0001782178217821782, "loss": 1.778, "step": 10700 }, { "epoch": 7.8, "grad_norm": 0.3440467417240143, "learning_rate": 0.0001772277227722772, "loss": 1.7773, "step": 10710 }, { "epoch": 7.8, "eval_accuracy": 0.6480254558570787, "eval_loss": 1.6308239698410034, "eval_runtime": 1084.4491, "eval_samples_per_second": 460.492, "eval_steps_per_second": 2.056, "step": 10710 }, { "epoch": 7.81, "grad_norm": 0.3812803626060486, "learning_rate": 0.00017623762376237624, "loss": 1.7776, "step": 10720 }, { "epoch": 7.82, "grad_norm": 0.3772016763687134, "learning_rate": 0.00017524752475247524, "loss": 1.7751, "step": 10730 }, { "epoch": 7.83, "grad_norm": 0.3638882339000702, "learning_rate": 0.00017425742574257426, "loss": 1.7794, "step": 10740 }, { "epoch": 7.83, "grad_norm": 0.43426576256752014, "learning_rate": 0.00017326732673267326, "loss": 1.7787, "step": 10750 }, { "epoch": 7.84, "grad_norm": 0.41420796513557434, "learning_rate": 0.00017227722772277226, "loss": 1.7766, "step": 10760 }, { "epoch": 7.85, "grad_norm": 0.3958096206188202, "learning_rate": 0.0001712871287128713, "loss": 1.7765, "step": 10770 }, { "epoch": 7.85, "grad_norm": 0.3829286992549896, "learning_rate": 0.0001702970297029703, "loss": 1.7773, "step": 10780 }, { "epoch": 7.86, "grad_norm": 0.41278573870658875, "learning_rate": 0.00016930693069306933, "loss": 1.7769, "step": 10790 }, { "epoch": 7.87, "grad_norm": 0.337071031332016, "learning_rate": 0.00016831683168316833, "loss": 1.7766, "step": 10800 }, { "epoch": 7.87, "eval_accuracy": 0.6483471740815172, "eval_loss": 1.6272797584533691, "eval_runtime": 1083.9003, "eval_samples_per_second": 460.725, "eval_steps_per_second": 2.057, "step": 10800 }, { "epoch": 7.88, "grad_norm": 0.3459130823612213, "learning_rate": 0.00016732673267326733, "loss": 1.7781, "step": 10810 }, { "epoch": 7.88, "grad_norm": 0.3339349925518036, "learning_rate": 0.00016633663366336635, "loss": 1.7754, "step": 10820 }, { "epoch": 7.89, "grad_norm": 0.3156519830226898, "learning_rate": 0.00016534653465346535, "loss": 1.776, "step": 10830 }, { "epoch": 7.9, "grad_norm": 0.3628999590873718, "learning_rate": 0.00016435643564356438, "loss": 1.7744, "step": 10840 }, { "epoch": 7.91, "grad_norm": 0.350087434053421, "learning_rate": 0.00016336633663366338, "loss": 1.7752, "step": 10850 }, { "epoch": 7.91, "grad_norm": 0.40664511919021606, "learning_rate": 0.00016237623762376237, "loss": 1.7781, "step": 10860 }, { "epoch": 7.92, "grad_norm": 0.3364078104496002, "learning_rate": 0.0001613861386138614, "loss": 1.776, "step": 10870 }, { "epoch": 7.93, "grad_norm": 0.32285594940185547, "learning_rate": 0.0001603960396039604, "loss": 1.772, "step": 10880 }, { "epoch": 7.93, "grad_norm": 0.39036986231803894, "learning_rate": 0.00015940594059405942, "loss": 1.7755, "step": 10890 }, { "epoch": 7.93, "eval_accuracy": 0.6488458576263911, "eval_loss": 1.6268614530563354, "eval_runtime": 1085.4437, "eval_samples_per_second": 460.07, "eval_steps_per_second": 2.054, "step": 10890 }, { "epoch": 7.94, "grad_norm": 0.41111525893211365, "learning_rate": 0.00015841584158415842, "loss": 1.7747, "step": 10900 }, { "epoch": 7.95, "grad_norm": 0.4492338299751282, "learning_rate": 0.00015742574257425742, "loss": 1.7753, "step": 10910 }, { "epoch": 7.96, "grad_norm": 0.3836340606212616, "learning_rate": 0.00015643564356435644, "loss": 1.7727, "step": 10920 }, { "epoch": 7.96, "grad_norm": 0.33723685145378113, "learning_rate": 0.00015544554455445544, "loss": 1.775, "step": 10930 }, { "epoch": 7.97, "grad_norm": 0.4088629484176636, "learning_rate": 0.00015445544554455447, "loss": 1.7741, "step": 10940 }, { "epoch": 7.98, "grad_norm": 0.3302168548107147, "learning_rate": 0.00015346534653465347, "loss": 1.7732, "step": 10950 }, { "epoch": 7.99, "grad_norm": 0.3605554401874542, "learning_rate": 0.00015247524752475246, "loss": 1.7722, "step": 10960 }, { "epoch": 7.99, "grad_norm": 0.355826735496521, "learning_rate": 0.0001514851485148515, "loss": 1.7715, "step": 10970 }, { "epoch": 8.0, "grad_norm": 0.3708418607711792, "learning_rate": 0.0001504950495049505, "loss": 1.7721, "step": 10980 }, { "epoch": 8.0, "eval_accuracy": 0.6491409467366662, "eval_loss": 1.6255041360855103, "eval_runtime": 1085.4323, "eval_samples_per_second": 460.075, "eval_steps_per_second": 2.054, "step": 10980 }, { "epoch": 8.01, "grad_norm": 0.3321845233440399, "learning_rate": 0.0001495049504950495, "loss": 1.7719, "step": 10990 }, { "epoch": 8.01, "grad_norm": 0.3251964747905731, "learning_rate": 0.0001485148514851485, "loss": 1.7731, "step": 11000 }, { "epoch": 8.02, "grad_norm": 0.4384077787399292, "learning_rate": 0.0001475247524752475, "loss": 1.7753, "step": 11010 }, { "epoch": 8.03, "grad_norm": 0.3885464370250702, "learning_rate": 0.00014653465346534653, "loss": 1.7707, "step": 11020 }, { "epoch": 8.04, "grad_norm": 0.42948633432388306, "learning_rate": 0.00014554455445544553, "loss": 1.7716, "step": 11030 }, { "epoch": 8.04, "grad_norm": 0.33700281381607056, "learning_rate": 0.00014455445544554456, "loss": 1.7711, "step": 11040 }, { "epoch": 8.05, "grad_norm": 0.37887144088745117, "learning_rate": 0.00014356435643564356, "loss": 1.7707, "step": 11050 }, { "epoch": 8.06, "grad_norm": 0.2920531630516052, "learning_rate": 0.00014257425742574255, "loss": 1.7717, "step": 11060 }, { "epoch": 8.07, "grad_norm": 0.41969195008277893, "learning_rate": 0.00014158415841584158, "loss": 1.77, "step": 11070 }, { "epoch": 8.07, "eval_accuracy": 0.6496409109474853, "eval_loss": 1.6225236654281616, "eval_runtime": 1085.9823, "eval_samples_per_second": 459.842, "eval_steps_per_second": 2.053, "step": 11070 }, { "epoch": 8.07, "grad_norm": 0.3497646152973175, "learning_rate": 0.0001405940594059406, "loss": 1.7712, "step": 11080 }, { "epoch": 8.08, "grad_norm": 0.38377320766448975, "learning_rate": 0.00013960396039603963, "loss": 1.77, "step": 11090 }, { "epoch": 8.09, "grad_norm": 0.3549109101295471, "learning_rate": 0.00013861386138613863, "loss": 1.7736, "step": 11100 }, { "epoch": 8.09, "grad_norm": 0.3375445604324341, "learning_rate": 0.00013762376237623763, "loss": 1.7715, "step": 11110 }, { "epoch": 8.1, "grad_norm": 0.30400267243385315, "learning_rate": 0.00013663366336633665, "loss": 1.7708, "step": 11120 }, { "epoch": 8.11, "grad_norm": 0.36075320839881897, "learning_rate": 0.00013564356435643565, "loss": 1.7706, "step": 11130 }, { "epoch": 8.12, "grad_norm": 0.40548309683799744, "learning_rate": 0.00013465346534653468, "loss": 1.7677, "step": 11140 }, { "epoch": 8.12, "grad_norm": 0.45684516429901123, "learning_rate": 0.00013366336633663367, "loss": 1.7682, "step": 11150 }, { "epoch": 8.13, "grad_norm": 0.35352224111557007, "learning_rate": 0.00013267326732673267, "loss": 1.7708, "step": 11160 }, { "epoch": 8.13, "eval_accuracy": 0.6498058850737081, "eval_loss": 1.6215531826019287, "eval_runtime": 1090.6922, "eval_samples_per_second": 457.856, "eval_steps_per_second": 2.045, "step": 11160 }, { "epoch": 8.14, "grad_norm": 0.32690539956092834, "learning_rate": 0.0001316831683168317, "loss": 1.7701, "step": 11170 }, { "epoch": 8.15, "grad_norm": 0.3275192975997925, "learning_rate": 0.0001306930693069307, "loss": 1.7664, "step": 11180 }, { "epoch": 8.15, "grad_norm": 0.35621440410614014, "learning_rate": 0.00012970297029702972, "loss": 1.7708, "step": 11190 }, { "epoch": 8.16, "grad_norm": 0.3187929093837738, "learning_rate": 0.00012871287128712872, "loss": 1.7686, "step": 11200 }, { "epoch": 8.17, "grad_norm": 0.30944034457206726, "learning_rate": 0.00012772277227722772, "loss": 1.7705, "step": 11210 }, { "epoch": 8.17, "grad_norm": 0.3147297501564026, "learning_rate": 0.00012673267326732674, "loss": 1.7668, "step": 11220 }, { "epoch": 8.18, "grad_norm": 0.31606265902519226, "learning_rate": 0.00012574257425742574, "loss": 1.7664, "step": 11230 }, { "epoch": 8.19, "grad_norm": 0.3430984914302826, "learning_rate": 0.00012475247524752477, "loss": 1.7688, "step": 11240 }, { "epoch": 8.2, "grad_norm": 0.36714431643486023, "learning_rate": 0.00012376237623762376, "loss": 1.7686, "step": 11250 }, { "epoch": 8.2, "eval_accuracy": 0.6500912899825928, "eval_loss": 1.6193368434906006, "eval_runtime": 1084.4098, "eval_samples_per_second": 460.509, "eval_steps_per_second": 2.056, "step": 11250 }, { "epoch": 8.2, "grad_norm": 0.33857467770576477, "learning_rate": 0.00012277227722772276, "loss": 1.7706, "step": 11260 }, { "epoch": 8.21, "grad_norm": 0.33994871377944946, "learning_rate": 0.00012178217821782179, "loss": 1.7657, "step": 11270 }, { "epoch": 8.22, "grad_norm": 0.3522297143936157, "learning_rate": 0.00012079207920792079, "loss": 1.7671, "step": 11280 }, { "epoch": 8.23, "grad_norm": 0.3409149646759033, "learning_rate": 0.0001198019801980198, "loss": 1.7683, "step": 11290 }, { "epoch": 8.23, "grad_norm": 0.34772610664367676, "learning_rate": 0.00011881188118811881, "loss": 1.7694, "step": 11300 }, { "epoch": 8.24, "grad_norm": 0.3905799388885498, "learning_rate": 0.00011782178217821782, "loss": 1.7676, "step": 11310 }, { "epoch": 8.25, "grad_norm": 0.3478334844112396, "learning_rate": 0.00011683168316831685, "loss": 1.7656, "step": 11320 }, { "epoch": 8.26, "grad_norm": 0.3372560143470764, "learning_rate": 0.00011584158415841584, "loss": 1.7662, "step": 11330 }, { "epoch": 8.26, "grad_norm": 0.31676506996154785, "learning_rate": 0.00011485148514851486, "loss": 1.7673, "step": 11340 }, { "epoch": 8.26, "eval_accuracy": 0.6503277550921033, "eval_loss": 1.6178277730941772, "eval_runtime": 1085.1752, "eval_samples_per_second": 460.184, "eval_steps_per_second": 2.055, "step": 11340 }, { "epoch": 8.27, "grad_norm": 0.31384769082069397, "learning_rate": 0.00011386138613861387, "loss": 1.7657, "step": 11350 }, { "epoch": 8.28, "grad_norm": 0.3267647922039032, "learning_rate": 0.00011287128712871288, "loss": 1.766, "step": 11360 }, { "epoch": 8.28, "grad_norm": 0.32966649532318115, "learning_rate": 0.00011188118811881189, "loss": 1.7636, "step": 11370 }, { "epoch": 8.29, "grad_norm": 0.34511563181877136, "learning_rate": 0.00011089108910891089, "loss": 1.767, "step": 11380 }, { "epoch": 8.3, "grad_norm": 0.3151010274887085, "learning_rate": 0.0001099009900990099, "loss": 1.7663, "step": 11390 }, { "epoch": 8.31, "grad_norm": 0.33481037616729736, "learning_rate": 0.00010891089108910891, "loss": 1.7687, "step": 11400 }, { "epoch": 8.31, "grad_norm": 0.35512518882751465, "learning_rate": 0.00010792079207920792, "loss": 1.7681, "step": 11410 }, { "epoch": 8.32, "grad_norm": 0.3472909927368164, "learning_rate": 0.00010693069306930694, "loss": 1.7653, "step": 11420 }, { "epoch": 8.33, "grad_norm": 0.3452986180782318, "learning_rate": 0.00010594059405940593, "loss": 1.7666, "step": 11430 }, { "epoch": 8.33, "eval_accuracy": 0.6505906465254327, "eval_loss": 1.6169500350952148, "eval_runtime": 1083.6278, "eval_samples_per_second": 460.841, "eval_steps_per_second": 2.058, "step": 11430 }, { "epoch": 8.34, "grad_norm": 0.34459248185157776, "learning_rate": 0.00010495049504950495, "loss": 1.7661, "step": 11440 }, { "epoch": 8.34, "grad_norm": 0.3042079508304596, "learning_rate": 0.00010396039603960396, "loss": 1.761, "step": 11450 }, { "epoch": 8.35, "grad_norm": 0.32908689975738525, "learning_rate": 0.00010297029702970297, "loss": 1.7657, "step": 11460 }, { "epoch": 8.36, "grad_norm": 0.34110862016677856, "learning_rate": 0.00010198019801980198, "loss": 1.7644, "step": 11470 }, { "epoch": 8.36, "grad_norm": 0.2914797365665436, "learning_rate": 0.00010099009900990099, "loss": 1.7659, "step": 11480 }, { "epoch": 8.37, "grad_norm": 0.32843562960624695, "learning_rate": 0.0001, "loss": 1.7634, "step": 11490 }, { "epoch": 8.38, "grad_norm": 0.3443434536457062, "learning_rate": 9.900990099009902e-05, "loss": 1.7647, "step": 11500 }, { "epoch": 8.39, "grad_norm": 0.3086354732513428, "learning_rate": 9.801980198019803e-05, "loss": 1.7633, "step": 11510 }, { "epoch": 8.39, "grad_norm": 0.2858017086982727, "learning_rate": 9.702970297029704e-05, "loss": 1.7635, "step": 11520 }, { "epoch": 8.39, "eval_accuracy": 0.6506746559931541, "eval_loss": 1.61593496799469, "eval_runtime": 1085.1265, "eval_samples_per_second": 460.204, "eval_steps_per_second": 2.055, "step": 11520 }, { "epoch": 8.4, "grad_norm": 0.32728368043899536, "learning_rate": 9.603960396039604e-05, "loss": 1.7637, "step": 11530 }, { "epoch": 8.41, "grad_norm": 0.3133088946342468, "learning_rate": 9.504950495049505e-05, "loss": 1.7629, "step": 11540 }, { "epoch": 8.42, "grad_norm": 0.2798636853694916, "learning_rate": 9.405940594059406e-05, "loss": 1.766, "step": 11550 }, { "epoch": 8.42, "grad_norm": 0.3476144075393677, "learning_rate": 9.306930693069307e-05, "loss": 1.7658, "step": 11560 }, { "epoch": 8.43, "grad_norm": 0.2828819751739502, "learning_rate": 9.207920792079209e-05, "loss": 1.7624, "step": 11570 }, { "epoch": 8.44, "grad_norm": 0.27723389863967896, "learning_rate": 9.108910891089108e-05, "loss": 1.7635, "step": 11580 }, { "epoch": 8.44, "grad_norm": 0.32631412148475647, "learning_rate": 9.00990099009901e-05, "loss": 1.7621, "step": 11590 }, { "epoch": 8.45, "grad_norm": 0.3203299641609192, "learning_rate": 8.91089108910891e-05, "loss": 1.7629, "step": 11600 }, { "epoch": 8.46, "grad_norm": 0.31430014967918396, "learning_rate": 8.811881188118812e-05, "loss": 1.7631, "step": 11610 }, { "epoch": 8.46, "eval_accuracy": 0.6510713372187361, "eval_loss": 1.6139030456542969, "eval_runtime": 1084.8096, "eval_samples_per_second": 460.339, "eval_steps_per_second": 2.056, "step": 11610 }, { "epoch": 8.47, "grad_norm": 0.302937775850296, "learning_rate": 8.712871287128713e-05, "loss": 1.7628, "step": 11620 }, { "epoch": 8.47, "grad_norm": 0.2944415807723999, "learning_rate": 8.613861386138613e-05, "loss": 1.7642, "step": 11630 }, { "epoch": 8.48, "grad_norm": 0.3318140506744385, "learning_rate": 8.514851485148515e-05, "loss": 1.7626, "step": 11640 }, { "epoch": 8.49, "grad_norm": 0.2729699909687042, "learning_rate": 8.415841584158417e-05, "loss": 1.7623, "step": 11650 }, { "epoch": 8.5, "grad_norm": 0.37606656551361084, "learning_rate": 8.316831683168318e-05, "loss": 1.7609, "step": 11660 }, { "epoch": 8.5, "grad_norm": 0.30239492654800415, "learning_rate": 8.217821782178219e-05, "loss": 1.7621, "step": 11670 }, { "epoch": 8.51, "grad_norm": 0.2841242253780365, "learning_rate": 8.118811881188119e-05, "loss": 1.7629, "step": 11680 }, { "epoch": 8.52, "grad_norm": 0.3027147054672241, "learning_rate": 8.01980198019802e-05, "loss": 1.7618, "step": 11690 }, { "epoch": 8.52, "grad_norm": 0.2852645218372345, "learning_rate": 7.920792079207921e-05, "loss": 1.7633, "step": 11700 }, { "epoch": 8.52, "eval_accuracy": 0.6512795497031492, "eval_loss": 1.6128230094909668, "eval_runtime": 1083.3765, "eval_samples_per_second": 460.948, "eval_steps_per_second": 2.058, "step": 11700 }, { "epoch": 8.53, "grad_norm": 0.30832818150520325, "learning_rate": 7.821782178217822e-05, "loss": 1.7606, "step": 11710 }, { "epoch": 8.54, "grad_norm": 0.30394139885902405, "learning_rate": 7.722772277227723e-05, "loss": 1.7609, "step": 11720 }, { "epoch": 8.55, "grad_norm": 0.2910577952861786, "learning_rate": 7.623762376237623e-05, "loss": 1.7607, "step": 11730 }, { "epoch": 8.55, "grad_norm": 0.27040547132492065, "learning_rate": 7.524752475247524e-05, "loss": 1.7607, "step": 11740 }, { "epoch": 8.56, "grad_norm": 0.2972952723503113, "learning_rate": 7.425742574257426e-05, "loss": 1.7615, "step": 11750 }, { "epoch": 8.57, "grad_norm": 0.26889026165008545, "learning_rate": 7.326732673267327e-05, "loss": 1.7631, "step": 11760 }, { "epoch": 8.58, "grad_norm": 0.3071919083595276, "learning_rate": 7.227722772277228e-05, "loss": 1.7572, "step": 11770 }, { "epoch": 8.58, "grad_norm": 0.30390483140945435, "learning_rate": 7.128712871287128e-05, "loss": 1.7593, "step": 11780 }, { "epoch": 8.59, "grad_norm": 0.2942393124103546, "learning_rate": 7.02970297029703e-05, "loss": 1.7616, "step": 11790 }, { "epoch": 8.59, "eval_accuracy": 0.6516749418564994, "eval_loss": 1.6112834215164185, "eval_runtime": 1083.6459, "eval_samples_per_second": 460.833, "eval_steps_per_second": 2.058, "step": 11790 }, { "epoch": 8.6, "grad_norm": 0.2823123037815094, "learning_rate": 6.930693069306931e-05, "loss": 1.7615, "step": 11800 }, { "epoch": 8.6, "grad_norm": 0.3058627247810364, "learning_rate": 6.831683168316833e-05, "loss": 1.7612, "step": 11810 }, { "epoch": 8.61, "grad_norm": 0.2954027056694031, "learning_rate": 6.732673267326734e-05, "loss": 1.7623, "step": 11820 }, { "epoch": 8.62, "grad_norm": 0.32210031151771545, "learning_rate": 6.633663366336634e-05, "loss": 1.7607, "step": 11830 }, { "epoch": 8.63, "grad_norm": 0.2638227343559265, "learning_rate": 6.534653465346535e-05, "loss": 1.7591, "step": 11840 }, { "epoch": 8.63, "grad_norm": 0.2716045379638672, "learning_rate": 6.435643564356436e-05, "loss": 1.7602, "step": 11850 }, { "epoch": 8.64, "grad_norm": 0.2823101282119751, "learning_rate": 6.336633663366337e-05, "loss": 1.7602, "step": 11860 }, { "epoch": 8.65, "grad_norm": 0.26111429929733276, "learning_rate": 6.237623762376238e-05, "loss": 1.7608, "step": 11870 }, { "epoch": 8.66, "grad_norm": 0.28957730531692505, "learning_rate": 6.138613861386138e-05, "loss": 1.7602, "step": 11880 }, { "epoch": 8.66, "eval_accuracy": 0.6518121780771471, "eval_loss": 1.610386610031128, "eval_runtime": 1085.801, "eval_samples_per_second": 459.919, "eval_steps_per_second": 2.054, "step": 11880 }, { "epoch": 8.66, "grad_norm": 0.31074461340904236, "learning_rate": 6.039603960396039e-05, "loss": 1.7599, "step": 11890 }, { "epoch": 8.67, "grad_norm": 0.2974682152271271, "learning_rate": 5.9405940594059404e-05, "loss": 1.7604, "step": 11900 }, { "epoch": 8.68, "grad_norm": 0.3034124970436096, "learning_rate": 5.841584158415842e-05, "loss": 1.7605, "step": 11910 }, { "epoch": 8.68, "grad_norm": 0.28555795550346375, "learning_rate": 5.742574257425743e-05, "loss": 1.7568, "step": 11920 }, { "epoch": 8.69, "grad_norm": 0.2668933868408203, "learning_rate": 5.643564356435644e-05, "loss": 1.7576, "step": 11930 }, { "epoch": 8.7, "grad_norm": 0.2799495458602905, "learning_rate": 5.5445544554455445e-05, "loss": 1.7595, "step": 11940 }, { "epoch": 8.71, "grad_norm": 0.28266316652297974, "learning_rate": 5.4455445544554456e-05, "loss": 1.7602, "step": 11950 }, { "epoch": 8.71, "grad_norm": 0.27878817915916443, "learning_rate": 5.346534653465347e-05, "loss": 1.7607, "step": 11960 }, { "epoch": 8.72, "grad_norm": 0.2904963493347168, "learning_rate": 5.247524752475247e-05, "loss": 1.7578, "step": 11970 }, { "epoch": 8.72, "eval_accuracy": 0.6515499792766575, "eval_loss": 1.6111468076705933, "eval_runtime": 1090.6667, "eval_samples_per_second": 457.867, "eval_steps_per_second": 2.045, "step": 11970 }, { "epoch": 8.73, "grad_norm": 0.289413720369339, "learning_rate": 5.1485148514851485e-05, "loss": 1.758, "step": 11980 }, { "epoch": 8.74, "grad_norm": 0.2739205062389374, "learning_rate": 5.0495049504950497e-05, "loss": 1.7579, "step": 11990 }, { "epoch": 8.74, "grad_norm": 0.26597511768341064, "learning_rate": 4.950495049504951e-05, "loss": 1.7568, "step": 12000 }, { "epoch": 8.75, "grad_norm": 0.24635004997253418, "learning_rate": 4.851485148514852e-05, "loss": 1.7584, "step": 12010 }, { "epoch": 8.76, "grad_norm": 0.2534136474132538, "learning_rate": 4.7524752475247525e-05, "loss": 1.7602, "step": 12020 }, { "epoch": 8.77, "grad_norm": 0.26007363200187683, "learning_rate": 4.653465346534654e-05, "loss": 1.7567, "step": 12030 }, { "epoch": 8.77, "grad_norm": 0.2807808816432953, "learning_rate": 4.554455445544554e-05, "loss": 1.7566, "step": 12040 }, { "epoch": 8.78, "grad_norm": 0.2677513360977173, "learning_rate": 4.455445544554455e-05, "loss": 1.7567, "step": 12050 }, { "epoch": 8.79, "grad_norm": 0.2691977620124817, "learning_rate": 4.3564356435643565e-05, "loss": 1.757, "step": 12060 }, { "epoch": 8.79, "eval_accuracy": 0.6521601327172856, "eval_loss": 1.60829758644104, "eval_runtime": 1089.928, "eval_samples_per_second": 458.177, "eval_steps_per_second": 2.046, "step": 12060 }, { "epoch": 8.79, "grad_norm": 0.2577356696128845, "learning_rate": 4.257425742574258e-05, "loss": 1.7584, "step": 12070 }, { "epoch": 8.8, "grad_norm": 0.2654874324798584, "learning_rate": 4.158415841584159e-05, "loss": 1.7571, "step": 12080 }, { "epoch": 8.81, "grad_norm": 0.25344353914260864, "learning_rate": 4.0594059405940594e-05, "loss": 1.7581, "step": 12090 }, { "epoch": 8.82, "grad_norm": 0.25865158438682556, "learning_rate": 3.9603960396039605e-05, "loss": 1.7552, "step": 12100 }, { "epoch": 8.82, "grad_norm": 0.28875982761383057, "learning_rate": 3.861386138613862e-05, "loss": 1.757, "step": 12110 }, { "epoch": 8.83, "grad_norm": 0.2697414755821228, "learning_rate": 3.762376237623762e-05, "loss": 1.7579, "step": 12120 }, { "epoch": 8.84, "grad_norm": 0.2786589562892914, "learning_rate": 3.6633663366336634e-05, "loss": 1.7583, "step": 12130 }, { "epoch": 8.85, "grad_norm": 0.258486270904541, "learning_rate": 3.564356435643564e-05, "loss": 1.7581, "step": 12140 }, { "epoch": 8.85, "grad_norm": 0.2595365345478058, "learning_rate": 3.465346534653466e-05, "loss": 1.757, "step": 12150 }, { "epoch": 8.85, "eval_accuracy": 0.652040482066107, "eval_loss": 1.6086018085479736, "eval_runtime": 1089.5635, "eval_samples_per_second": 458.33, "eval_steps_per_second": 2.047, "step": 12150 }, { "epoch": 8.86, "grad_norm": 0.25674012303352356, "learning_rate": 3.366336633663367e-05, "loss": 1.7595, "step": 12160 }, { "epoch": 8.87, "grad_norm": 0.23194921016693115, "learning_rate": 3.2673267326732674e-05, "loss": 1.7574, "step": 12170 }, { "epoch": 8.87, "grad_norm": 0.2626875936985016, "learning_rate": 3.1683168316831686e-05, "loss": 1.7571, "step": 12180 }, { "epoch": 8.88, "grad_norm": 0.2361476868391037, "learning_rate": 3.069306930693069e-05, "loss": 1.7573, "step": 12190 }, { "epoch": 8.89, "grad_norm": 0.2606755793094635, "learning_rate": 2.9702970297029702e-05, "loss": 1.7567, "step": 12200 }, { "epoch": 8.9, "grad_norm": 0.27499887347221375, "learning_rate": 2.8712871287128714e-05, "loss": 1.7579, "step": 12210 }, { "epoch": 8.9, "grad_norm": 0.24832656979560852, "learning_rate": 2.7722772277227722e-05, "loss": 1.7566, "step": 12220 }, { "epoch": 8.91, "grad_norm": 0.24898388981819153, "learning_rate": 2.6732673267326734e-05, "loss": 1.7544, "step": 12230 }, { "epoch": 8.92, "grad_norm": 0.24266423285007477, "learning_rate": 2.5742574257425742e-05, "loss": 1.7559, "step": 12240 }, { "epoch": 8.92, "eval_accuracy": 0.6522573824099933, "eval_loss": 1.6079708337783813, "eval_runtime": 1089.9176, "eval_samples_per_second": 458.181, "eval_steps_per_second": 2.046, "step": 12240 }, { "epoch": 8.93, "grad_norm": 0.2438860386610031, "learning_rate": 2.4752475247524754e-05, "loss": 1.7554, "step": 12250 }, { "epoch": 8.93, "grad_norm": 0.22911418974399567, "learning_rate": 2.3762376237623762e-05, "loss": 1.7547, "step": 12260 }, { "epoch": 8.94, "grad_norm": 0.2550877034664154, "learning_rate": 2.277227722772277e-05, "loss": 1.7567, "step": 12270 }, { "epoch": 8.95, "grad_norm": 0.2409505546092987, "learning_rate": 2.1782178217821783e-05, "loss": 1.7556, "step": 12280 }, { "epoch": 8.95, "grad_norm": 0.23632997274398804, "learning_rate": 2.0792079207920794e-05, "loss": 1.7573, "step": 12290 }, { "epoch": 8.96, "grad_norm": 0.22292740643024445, "learning_rate": 1.9801980198019803e-05, "loss": 1.757, "step": 12300 }, { "epoch": 8.97, "grad_norm": 0.2350420504808426, "learning_rate": 1.881188118811881e-05, "loss": 1.756, "step": 12310 }, { "epoch": 8.98, "grad_norm": 0.22938278317451477, "learning_rate": 1.782178217821782e-05, "loss": 1.7562, "step": 12320 }, { "epoch": 8.98, "grad_norm": 0.2246268391609192, "learning_rate": 1.6831683168316834e-05, "loss": 1.7556, "step": 12330 }, { "epoch": 8.98, "eval_accuracy": 0.652376308176148, "eval_loss": 1.6073620319366455, "eval_runtime": 1088.9818, "eval_samples_per_second": 458.575, "eval_steps_per_second": 2.048, "step": 12330 }, { "epoch": 8.99, "grad_norm": 0.22820483148097992, "learning_rate": 1.5841584158415843e-05, "loss": 1.7564, "step": 12340 }, { "epoch": 9.0, "grad_norm": 0.2315167486667633, "learning_rate": 1.4851485148514851e-05, "loss": 1.7558, "step": 12350 }, { "epoch": 9.01, "grad_norm": 0.21513350307941437, "learning_rate": 1.3861386138613861e-05, "loss": 1.757, "step": 12360 }, { "epoch": 9.01, "grad_norm": 0.21538245677947998, "learning_rate": 1.2871287128712871e-05, "loss": 1.7527, "step": 12370 }, { "epoch": 9.02, "grad_norm": 0.22796376049518585, "learning_rate": 1.1881188118811881e-05, "loss": 1.7549, "step": 12380 }, { "epoch": 9.03, "grad_norm": 0.21846508979797363, "learning_rate": 1.0891089108910891e-05, "loss": 1.7527, "step": 12390 }, { "epoch": 9.03, "grad_norm": 0.2252340316772461, "learning_rate": 9.900990099009901e-06, "loss": 1.757, "step": 12400 }, { "epoch": 9.04, "grad_norm": 0.22679966688156128, "learning_rate": 8.91089108910891e-06, "loss": 1.7547, "step": 12410 }, { "epoch": 9.05, "grad_norm": 0.21749068796634674, "learning_rate": 7.920792079207921e-06, "loss": 1.755, "step": 12420 }, { "epoch": 9.05, "eval_accuracy": 0.6525192559694988, "eval_loss": 1.6068978309631348, "eval_runtime": 1087.147, "eval_samples_per_second": 459.349, "eval_steps_per_second": 2.051, "step": 12420 } ], "logging_steps": 10, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 90, "total_flos": 1.28938481325833e+18, "train_batch_size": 192, "trial_name": null, "trial_params": null }