HuggingfaceBest5ClassModel / trainer_state.json
Anders L|hr
Best 5 class model
d4b56e8
{
"best_metric": 0.7937062937062938,
"best_model_checkpoint": "wav2vec2-5Class-train-test-finetune/checkpoint-4122",
"epoch": 224.0,
"eval_steps": 500,
"global_step": 5432,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.99,
"eval_accuracy": 0.34265734265734266,
"eval_loss": 1.5984586477279663,
"eval_runtime": 5.3437,
"eval_samples_per_second": 53.521,
"eval_steps_per_second": 3.368,
"step": 24
},
{
"epoch": 1.98,
"eval_accuracy": 0.33916083916083917,
"eval_loss": 1.5969289541244507,
"eval_runtime": 3.8653,
"eval_samples_per_second": 73.992,
"eval_steps_per_second": 4.657,
"step": 48
},
{
"epoch": 2.06,
"grad_norm": 1.0544973611831665,
"learning_rate": 2.4999999999999998e-06,
"loss": 1.5969,
"step": 50
},
{
"epoch": 2.97,
"eval_accuracy": 0.32867132867132864,
"eval_loss": 1.5943816900253296,
"eval_runtime": 6.1748,
"eval_samples_per_second": 46.317,
"eval_steps_per_second": 2.915,
"step": 72
},
{
"epoch": 4.0,
"eval_accuracy": 0.3146853146853147,
"eval_loss": 1.5906767845153809,
"eval_runtime": 5.1678,
"eval_samples_per_second": 55.343,
"eval_steps_per_second": 3.483,
"step": 97
},
{
"epoch": 4.12,
"grad_norm": 0.8443157076835632,
"learning_rate": 4.9999999999999996e-06,
"loss": 1.5896,
"step": 100
},
{
"epoch": 4.99,
"eval_accuracy": 0.2972027972027972,
"eval_loss": 1.5860023498535156,
"eval_runtime": 4.9416,
"eval_samples_per_second": 57.876,
"eval_steps_per_second": 3.643,
"step": 121
},
{
"epoch": 5.98,
"eval_accuracy": 0.2692307692307692,
"eval_loss": 1.5806005001068115,
"eval_runtime": 4.1837,
"eval_samples_per_second": 68.36,
"eval_steps_per_second": 4.302,
"step": 145
},
{
"epoch": 6.19,
"grad_norm": 1.0938074588775635,
"learning_rate": 7.5e-06,
"loss": 1.5743,
"step": 150
},
{
"epoch": 6.97,
"eval_accuracy": 0.25874125874125875,
"eval_loss": 1.5742768049240112,
"eval_runtime": 7.1914,
"eval_samples_per_second": 39.77,
"eval_steps_per_second": 2.503,
"step": 169
},
{
"epoch": 8.0,
"eval_accuracy": 0.23426573426573427,
"eval_loss": 1.5664165019989014,
"eval_runtime": 5.6489,
"eval_samples_per_second": 50.629,
"eval_steps_per_second": 3.186,
"step": 194
},
{
"epoch": 8.25,
"grad_norm": 0.9692079424858093,
"learning_rate": 9.999999999999999e-06,
"loss": 1.5508,
"step": 200
},
{
"epoch": 8.99,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.557572841644287,
"eval_runtime": 5.5182,
"eval_samples_per_second": 51.828,
"eval_steps_per_second": 3.262,
"step": 218
},
{
"epoch": 9.98,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.5482373237609863,
"eval_runtime": 5.3205,
"eval_samples_per_second": 53.754,
"eval_steps_per_second": 3.383,
"step": 242
},
{
"epoch": 10.31,
"grad_norm": 1.02046799659729,
"learning_rate": 1.25e-05,
"loss": 1.5157,
"step": 250
},
{
"epoch": 10.97,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.539355993270874,
"eval_runtime": 6.3116,
"eval_samples_per_second": 45.313,
"eval_steps_per_second": 2.852,
"step": 266
},
{
"epoch": 12.0,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.5350520610809326,
"eval_runtime": 4.3422,
"eval_samples_per_second": 65.865,
"eval_steps_per_second": 4.145,
"step": 291
},
{
"epoch": 12.37,
"grad_norm": 1.6058833599090576,
"learning_rate": 1.5e-05,
"loss": 1.4534,
"step": 300
},
{
"epoch": 12.99,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.5525730848312378,
"eval_runtime": 5.245,
"eval_samples_per_second": 54.528,
"eval_steps_per_second": 3.432,
"step": 315
},
{
"epoch": 13.98,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.599926471710205,
"eval_runtime": 6.0088,
"eval_samples_per_second": 47.597,
"eval_steps_per_second": 2.996,
"step": 339
},
{
"epoch": 14.43,
"grad_norm": 0.8243080377578735,
"learning_rate": 1.7500000000000002e-05,
"loss": 1.3638,
"step": 350
},
{
"epoch": 14.97,
"eval_accuracy": 0.22727272727272727,
"eval_loss": 1.5896875858306885,
"eval_runtime": 4.8752,
"eval_samples_per_second": 58.664,
"eval_steps_per_second": 3.692,
"step": 363
},
{
"epoch": 16.0,
"eval_accuracy": 0.26573426573426573,
"eval_loss": 1.560091495513916,
"eval_runtime": 5.5082,
"eval_samples_per_second": 51.922,
"eval_steps_per_second": 3.268,
"step": 388
},
{
"epoch": 16.49,
"grad_norm": 0.7977257370948792,
"learning_rate": 1.9999999999999998e-05,
"loss": 1.2951,
"step": 400
},
{
"epoch": 16.99,
"eval_accuracy": 0.2937062937062937,
"eval_loss": 1.5349317789077759,
"eval_runtime": 4.7526,
"eval_samples_per_second": 60.178,
"eval_steps_per_second": 3.787,
"step": 412
},
{
"epoch": 17.98,
"eval_accuracy": 0.34265734265734266,
"eval_loss": 1.5053907632827759,
"eval_runtime": 4.8638,
"eval_samples_per_second": 58.801,
"eval_steps_per_second": 3.701,
"step": 436
},
{
"epoch": 18.56,
"grad_norm": 0.7064552903175354,
"learning_rate": 2.25e-05,
"loss": 1.2369,
"step": 450
},
{
"epoch": 18.97,
"eval_accuracy": 0.3741258741258741,
"eval_loss": 1.4689087867736816,
"eval_runtime": 4.3712,
"eval_samples_per_second": 65.428,
"eval_steps_per_second": 4.118,
"step": 460
},
{
"epoch": 20.0,
"eval_accuracy": 0.4370629370629371,
"eval_loss": 1.404613971710205,
"eval_runtime": 4.7203,
"eval_samples_per_second": 60.59,
"eval_steps_per_second": 3.813,
"step": 485
},
{
"epoch": 20.62,
"grad_norm": 0.598238468170166,
"learning_rate": 2.5e-05,
"loss": 1.1566,
"step": 500
},
{
"epoch": 20.99,
"eval_accuracy": 0.4405594405594406,
"eval_loss": 1.3691043853759766,
"eval_runtime": 6.6443,
"eval_samples_per_second": 43.044,
"eval_steps_per_second": 2.709,
"step": 509
},
{
"epoch": 21.98,
"eval_accuracy": 0.4825174825174825,
"eval_loss": 1.3120107650756836,
"eval_runtime": 4.9585,
"eval_samples_per_second": 57.679,
"eval_steps_per_second": 3.63,
"step": 533
},
{
"epoch": 22.68,
"grad_norm": 0.682925820350647,
"learning_rate": 2.75e-05,
"loss": 1.0676,
"step": 550
},
{
"epoch": 22.97,
"eval_accuracy": 0.486013986013986,
"eval_loss": 1.2839338779449463,
"eval_runtime": 4.0382,
"eval_samples_per_second": 70.824,
"eval_steps_per_second": 4.457,
"step": 557
},
{
"epoch": 24.0,
"eval_accuracy": 0.5104895104895105,
"eval_loss": 1.2549891471862793,
"eval_runtime": 5.1896,
"eval_samples_per_second": 55.11,
"eval_steps_per_second": 3.468,
"step": 582
},
{
"epoch": 24.74,
"grad_norm": 1.1368101835250854,
"learning_rate": 3e-05,
"loss": 0.992,
"step": 600
},
{
"epoch": 24.99,
"eval_accuracy": 0.5209790209790209,
"eval_loss": 1.2106566429138184,
"eval_runtime": 6.8941,
"eval_samples_per_second": 41.485,
"eval_steps_per_second": 2.611,
"step": 606
},
{
"epoch": 25.98,
"eval_accuracy": 0.5384615384615384,
"eval_loss": 1.1711338758468628,
"eval_runtime": 4.9707,
"eval_samples_per_second": 57.537,
"eval_steps_per_second": 3.621,
"step": 630
},
{
"epoch": 26.8,
"grad_norm": 0.9649831056594849,
"learning_rate": 2.9722222222222223e-05,
"loss": 0.9272,
"step": 650
},
{
"epoch": 26.97,
"eval_accuracy": 0.5594405594405595,
"eval_loss": 1.1318116188049316,
"eval_runtime": 5.5564,
"eval_samples_per_second": 51.472,
"eval_steps_per_second": 3.24,
"step": 654
},
{
"epoch": 28.0,
"eval_accuracy": 0.6153846153846154,
"eval_loss": 1.0594333410263062,
"eval_runtime": 4.6773,
"eval_samples_per_second": 61.147,
"eval_steps_per_second": 3.848,
"step": 679
},
{
"epoch": 28.87,
"grad_norm": 0.883937418460846,
"learning_rate": 2.9444444444444445e-05,
"loss": 0.8478,
"step": 700
},
{
"epoch": 28.99,
"eval_accuracy": 0.6013986013986014,
"eval_loss": 1.054669737815857,
"eval_runtime": 4.9219,
"eval_samples_per_second": 58.108,
"eval_steps_per_second": 3.657,
"step": 703
},
{
"epoch": 29.98,
"eval_accuracy": 0.6363636363636364,
"eval_loss": 0.9822685122489929,
"eval_runtime": 6.3133,
"eval_samples_per_second": 45.302,
"eval_steps_per_second": 2.851,
"step": 727
},
{
"epoch": 30.93,
"grad_norm": 1.3742878437042236,
"learning_rate": 2.9166666666666666e-05,
"loss": 0.7627,
"step": 750
},
{
"epoch": 30.97,
"eval_accuracy": 0.6398601398601399,
"eval_loss": 1.00295090675354,
"eval_runtime": 6.154,
"eval_samples_per_second": 46.473,
"eval_steps_per_second": 2.925,
"step": 751
},
{
"epoch": 32.0,
"eval_accuracy": 0.6608391608391608,
"eval_loss": 0.930969774723053,
"eval_runtime": 5.6747,
"eval_samples_per_second": 50.399,
"eval_steps_per_second": 3.172,
"step": 776
},
{
"epoch": 32.99,
"grad_norm": 1.329268217086792,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.7266,
"step": 800
},
{
"epoch": 32.99,
"eval_accuracy": 0.6678321678321678,
"eval_loss": 0.9228739738464355,
"eval_runtime": 5.382,
"eval_samples_per_second": 53.14,
"eval_steps_per_second": 3.344,
"step": 800
},
{
"epoch": 33.98,
"eval_accuracy": 0.6958041958041958,
"eval_loss": 0.8684509992599487,
"eval_runtime": 4.8497,
"eval_samples_per_second": 58.973,
"eval_steps_per_second": 3.712,
"step": 824
},
{
"epoch": 34.97,
"eval_accuracy": 0.6643356643356644,
"eval_loss": 0.8954732418060303,
"eval_runtime": 5.2083,
"eval_samples_per_second": 54.912,
"eval_steps_per_second": 3.456,
"step": 848
},
{
"epoch": 35.05,
"grad_norm": 1.3892701864242554,
"learning_rate": 2.8611111111111113e-05,
"loss": 0.6906,
"step": 850
},
{
"epoch": 36.0,
"eval_accuracy": 0.6713286713286714,
"eval_loss": 0.9125654101371765,
"eval_runtime": 5.3068,
"eval_samples_per_second": 53.894,
"eval_steps_per_second": 3.392,
"step": 873
},
{
"epoch": 36.99,
"eval_accuracy": 0.6923076923076923,
"eval_loss": 0.8543534874916077,
"eval_runtime": 4.3351,
"eval_samples_per_second": 65.974,
"eval_steps_per_second": 4.152,
"step": 897
},
{
"epoch": 37.11,
"grad_norm": 0.836291491985321,
"learning_rate": 2.8333333333333332e-05,
"loss": 0.6721,
"step": 900
},
{
"epoch": 37.98,
"eval_accuracy": 0.6923076923076923,
"eval_loss": 0.8480322957038879,
"eval_runtime": 5.1861,
"eval_samples_per_second": 55.147,
"eval_steps_per_second": 3.471,
"step": 921
},
{
"epoch": 38.97,
"eval_accuracy": 0.7097902097902098,
"eval_loss": 0.8354606628417969,
"eval_runtime": 6.3247,
"eval_samples_per_second": 45.22,
"eval_steps_per_second": 2.846,
"step": 945
},
{
"epoch": 39.18,
"grad_norm": 1.6499431133270264,
"learning_rate": 2.8055555555555557e-05,
"loss": 0.6442,
"step": 950
},
{
"epoch": 40.0,
"eval_accuracy": 0.6958041958041958,
"eval_loss": 0.8412452340126038,
"eval_runtime": 5.2281,
"eval_samples_per_second": 54.704,
"eval_steps_per_second": 3.443,
"step": 970
},
{
"epoch": 40.99,
"eval_accuracy": 0.6888111888111889,
"eval_loss": 0.8356389999389648,
"eval_runtime": 4.8326,
"eval_samples_per_second": 59.181,
"eval_steps_per_second": 3.725,
"step": 994
},
{
"epoch": 41.24,
"grad_norm": 1.1766818761825562,
"learning_rate": 2.777777777777778e-05,
"loss": 0.6465,
"step": 1000
},
{
"epoch": 41.98,
"eval_accuracy": 0.7062937062937062,
"eval_loss": 0.8180016875267029,
"eval_runtime": 5.7926,
"eval_samples_per_second": 49.374,
"eval_steps_per_second": 3.107,
"step": 1018
},
{
"epoch": 42.97,
"eval_accuracy": 0.7027972027972028,
"eval_loss": 0.8103991150856018,
"eval_runtime": 5.5185,
"eval_samples_per_second": 51.825,
"eval_steps_per_second": 3.262,
"step": 1042
},
{
"epoch": 43.3,
"grad_norm": 0.9722403287887573,
"learning_rate": 2.75e-05,
"loss": 0.6086,
"step": 1050
},
{
"epoch": 44.0,
"eval_accuracy": 0.6958041958041958,
"eval_loss": 0.8162235617637634,
"eval_runtime": 4.9174,
"eval_samples_per_second": 58.161,
"eval_steps_per_second": 3.66,
"step": 1067
},
{
"epoch": 44.99,
"eval_accuracy": 0.7027972027972028,
"eval_loss": 0.7957289218902588,
"eval_runtime": 4.6891,
"eval_samples_per_second": 60.992,
"eval_steps_per_second": 3.839,
"step": 1091
},
{
"epoch": 45.36,
"grad_norm": 1.269113302230835,
"learning_rate": 2.7222222222222223e-05,
"loss": 0.5863,
"step": 1100
},
{
"epoch": 45.98,
"eval_accuracy": 0.6958041958041958,
"eval_loss": 0.8143528699874878,
"eval_runtime": 6.6805,
"eval_samples_per_second": 42.811,
"eval_steps_per_second": 2.694,
"step": 1115
},
{
"epoch": 46.97,
"eval_accuracy": 0.7027972027972028,
"eval_loss": 0.78568434715271,
"eval_runtime": 4.7422,
"eval_samples_per_second": 60.31,
"eval_steps_per_second": 3.796,
"step": 1139
},
{
"epoch": 47.42,
"grad_norm": 0.9775255918502808,
"learning_rate": 2.6944444444444445e-05,
"loss": 0.5877,
"step": 1150
},
{
"epoch": 48.0,
"eval_accuracy": 0.7132867132867133,
"eval_loss": 0.7764595150947571,
"eval_runtime": 5.76,
"eval_samples_per_second": 49.653,
"eval_steps_per_second": 3.125,
"step": 1164
},
{
"epoch": 48.99,
"eval_accuracy": 0.6993006993006993,
"eval_loss": 0.7881478071212769,
"eval_runtime": 5.4965,
"eval_samples_per_second": 52.033,
"eval_steps_per_second": 3.275,
"step": 1188
},
{
"epoch": 49.48,
"grad_norm": 1.540124773979187,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.5629,
"step": 1200
},
{
"epoch": 49.98,
"eval_accuracy": 0.7097902097902098,
"eval_loss": 0.7658265829086304,
"eval_runtime": 4.731,
"eval_samples_per_second": 60.452,
"eval_steps_per_second": 3.805,
"step": 1212
},
{
"epoch": 50.97,
"eval_accuracy": 0.7132867132867133,
"eval_loss": 0.7723098397254944,
"eval_runtime": 5.8352,
"eval_samples_per_second": 49.013,
"eval_steps_per_second": 3.085,
"step": 1236
},
{
"epoch": 51.55,
"grad_norm": 1.2498500347137451,
"learning_rate": 2.6388888888888892e-05,
"loss": 0.5476,
"step": 1250
},
{
"epoch": 52.0,
"eval_accuracy": 0.7097902097902098,
"eval_loss": 0.7603952884674072,
"eval_runtime": 4.448,
"eval_samples_per_second": 64.299,
"eval_steps_per_second": 4.047,
"step": 1261
},
{
"epoch": 52.99,
"eval_accuracy": 0.7202797202797203,
"eval_loss": 0.7554137706756592,
"eval_runtime": 6.4218,
"eval_samples_per_second": 44.536,
"eval_steps_per_second": 2.803,
"step": 1285
},
{
"epoch": 53.61,
"grad_norm": 0.9919388890266418,
"learning_rate": 2.6116666666666667e-05,
"loss": 0.5357,
"step": 1300
},
{
"epoch": 53.98,
"eval_accuracy": 0.7307692307692307,
"eval_loss": 0.7458928227424622,
"eval_runtime": 5.3791,
"eval_samples_per_second": 53.168,
"eval_steps_per_second": 3.346,
"step": 1309
},
{
"epoch": 54.97,
"eval_accuracy": 0.7132867132867133,
"eval_loss": 0.7632877230644226,
"eval_runtime": 5.278,
"eval_samples_per_second": 54.187,
"eval_steps_per_second": 3.41,
"step": 1333
},
{
"epoch": 55.67,
"grad_norm": 1.688183307647705,
"learning_rate": 2.5838888888888892e-05,
"loss": 0.5335,
"step": 1350
},
{
"epoch": 56.0,
"eval_accuracy": 0.7167832167832168,
"eval_loss": 0.768308162689209,
"eval_runtime": 5.7022,
"eval_samples_per_second": 50.156,
"eval_steps_per_second": 3.157,
"step": 1358
},
{
"epoch": 56.99,
"eval_accuracy": 0.7307692307692307,
"eval_loss": 0.7380541563034058,
"eval_runtime": 4.522,
"eval_samples_per_second": 63.247,
"eval_steps_per_second": 3.981,
"step": 1382
},
{
"epoch": 57.73,
"grad_norm": 1.4895784854888916,
"learning_rate": 2.556111111111111e-05,
"loss": 0.5107,
"step": 1400
},
{
"epoch": 57.98,
"eval_accuracy": 0.7377622377622378,
"eval_loss": 0.7308338284492493,
"eval_runtime": 4.4787,
"eval_samples_per_second": 63.857,
"eval_steps_per_second": 4.019,
"step": 1406
},
{
"epoch": 58.97,
"eval_accuracy": 0.7237762237762237,
"eval_loss": 0.7441032528877258,
"eval_runtime": 5.8744,
"eval_samples_per_second": 48.685,
"eval_steps_per_second": 3.064,
"step": 1430
},
{
"epoch": 59.79,
"grad_norm": 1.4925004243850708,
"learning_rate": 2.5283333333333332e-05,
"loss": 0.5105,
"step": 1450
},
{
"epoch": 60.0,
"eval_accuracy": 0.7307692307692307,
"eval_loss": 0.7481815218925476,
"eval_runtime": 7.272,
"eval_samples_per_second": 39.329,
"eval_steps_per_second": 2.475,
"step": 1455
},
{
"epoch": 60.99,
"eval_accuracy": 0.7342657342657343,
"eval_loss": 0.733482301235199,
"eval_runtime": 4.6235,
"eval_samples_per_second": 61.858,
"eval_steps_per_second": 3.893,
"step": 1479
},
{
"epoch": 61.86,
"grad_norm": 1.3200663328170776,
"learning_rate": 2.5005555555555558e-05,
"loss": 0.4914,
"step": 1500
},
{
"epoch": 61.98,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.7241908311843872,
"eval_runtime": 4.8198,
"eval_samples_per_second": 59.338,
"eval_steps_per_second": 3.735,
"step": 1503
},
{
"epoch": 62.97,
"eval_accuracy": 0.7377622377622378,
"eval_loss": 0.7321043014526367,
"eval_runtime": 5.8929,
"eval_samples_per_second": 48.533,
"eval_steps_per_second": 3.055,
"step": 1527
},
{
"epoch": 63.92,
"grad_norm": 1.1309747695922852,
"learning_rate": 2.472777777777778e-05,
"loss": 0.4839,
"step": 1550
},
{
"epoch": 64.0,
"eval_accuracy": 0.7342657342657343,
"eval_loss": 0.7220665216445923,
"eval_runtime": 5.8635,
"eval_samples_per_second": 48.776,
"eval_steps_per_second": 3.07,
"step": 1552
},
{
"epoch": 64.99,
"eval_accuracy": 0.7412587412587412,
"eval_loss": 0.7136482000350952,
"eval_runtime": 4.3102,
"eval_samples_per_second": 66.354,
"eval_steps_per_second": 4.176,
"step": 1576
},
{
"epoch": 65.98,
"grad_norm": 1.1314157247543335,
"learning_rate": 2.4449999999999998e-05,
"loss": 0.4751,
"step": 1600
},
{
"epoch": 65.98,
"eval_accuracy": 0.7412587412587412,
"eval_loss": 0.7198111414909363,
"eval_runtime": 4.7841,
"eval_samples_per_second": 59.781,
"eval_steps_per_second": 3.762,
"step": 1600
},
{
"epoch": 66.97,
"eval_accuracy": 0.7377622377622378,
"eval_loss": 0.7145721912384033,
"eval_runtime": 6.347,
"eval_samples_per_second": 45.061,
"eval_steps_per_second": 2.836,
"step": 1624
},
{
"epoch": 68.0,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.6970916390419006,
"eval_runtime": 5.6871,
"eval_samples_per_second": 50.289,
"eval_steps_per_second": 3.165,
"step": 1649
},
{
"epoch": 68.04,
"grad_norm": 2.397585153579712,
"learning_rate": 2.4172222222222223e-05,
"loss": 0.4639,
"step": 1650
},
{
"epoch": 68.99,
"eval_accuracy": 0.7272727272727273,
"eval_loss": 0.7201464176177979,
"eval_runtime": 4.4157,
"eval_samples_per_second": 64.769,
"eval_steps_per_second": 4.076,
"step": 1673
},
{
"epoch": 69.98,
"eval_accuracy": 0.7307692307692307,
"eval_loss": 0.7244682312011719,
"eval_runtime": 5.4392,
"eval_samples_per_second": 52.581,
"eval_steps_per_second": 3.309,
"step": 1697
},
{
"epoch": 70.1,
"grad_norm": 2.062610387802124,
"learning_rate": 2.3894444444444445e-05,
"loss": 0.4581,
"step": 1700
},
{
"epoch": 70.97,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.7077587842941284,
"eval_runtime": 5.1002,
"eval_samples_per_second": 56.076,
"eval_steps_per_second": 3.529,
"step": 1721
},
{
"epoch": 72.0,
"eval_accuracy": 0.7517482517482518,
"eval_loss": 0.6957913637161255,
"eval_runtime": 4.4485,
"eval_samples_per_second": 64.291,
"eval_steps_per_second": 4.046,
"step": 1746
},
{
"epoch": 72.16,
"grad_norm": 2.7808456420898438,
"learning_rate": 2.3616666666666667e-05,
"loss": 0.4643,
"step": 1750
},
{
"epoch": 72.99,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.7036928534507751,
"eval_runtime": 5.9101,
"eval_samples_per_second": 48.392,
"eval_steps_per_second": 3.046,
"step": 1770
},
{
"epoch": 73.98,
"eval_accuracy": 0.7482517482517482,
"eval_loss": 0.71629399061203,
"eval_runtime": 6.0211,
"eval_samples_per_second": 47.5,
"eval_steps_per_second": 2.989,
"step": 1794
},
{
"epoch": 74.23,
"grad_norm": 1.78495192527771,
"learning_rate": 2.333888888888889e-05,
"loss": 0.442,
"step": 1800
},
{
"epoch": 74.97,
"eval_accuracy": 0.7377622377622378,
"eval_loss": 0.6997957229614258,
"eval_runtime": 4.4212,
"eval_samples_per_second": 64.688,
"eval_steps_per_second": 4.071,
"step": 1818
},
{
"epoch": 76.0,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.6946483850479126,
"eval_runtime": 4.0507,
"eval_samples_per_second": 70.605,
"eval_steps_per_second": 4.444,
"step": 1843
},
{
"epoch": 76.29,
"grad_norm": 1.7383118867874146,
"learning_rate": 2.306111111111111e-05,
"loss": 0.4305,
"step": 1850
},
{
"epoch": 76.99,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6857091784477234,
"eval_runtime": 4.1718,
"eval_samples_per_second": 68.556,
"eval_steps_per_second": 4.315,
"step": 1867
},
{
"epoch": 77.98,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.6936307549476624,
"eval_runtime": 3.8781,
"eval_samples_per_second": 73.747,
"eval_steps_per_second": 4.641,
"step": 1891
},
{
"epoch": 78.35,
"grad_norm": 1.047067403793335,
"learning_rate": 2.2783333333333336e-05,
"loss": 0.4416,
"step": 1900
},
{
"epoch": 78.97,
"eval_accuracy": 0.7517482517482518,
"eval_loss": 0.6965110301971436,
"eval_runtime": 5.1318,
"eval_samples_per_second": 55.731,
"eval_steps_per_second": 3.508,
"step": 1915
},
{
"epoch": 80.0,
"eval_accuracy": 0.7482517482517482,
"eval_loss": 0.7017127871513367,
"eval_runtime": 4.3418,
"eval_samples_per_second": 65.871,
"eval_steps_per_second": 4.146,
"step": 1940
},
{
"epoch": 80.41,
"grad_norm": 1.5354928970336914,
"learning_rate": 2.2505555555555554e-05,
"loss": 0.428,
"step": 1950
},
{
"epoch": 80.99,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6970596313476562,
"eval_runtime": 5.973,
"eval_samples_per_second": 47.882,
"eval_steps_per_second": 3.014,
"step": 1964
},
{
"epoch": 81.98,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6897542476654053,
"eval_runtime": 5.0481,
"eval_samples_per_second": 56.655,
"eval_steps_per_second": 3.566,
"step": 1988
},
{
"epoch": 82.47,
"grad_norm": 1.7141317129135132,
"learning_rate": 2.2227777777777776e-05,
"loss": 0.4093,
"step": 2000
},
{
"epoch": 82.97,
"eval_accuracy": 0.7482517482517482,
"eval_loss": 0.7004020810127258,
"eval_runtime": 4.1986,
"eval_samples_per_second": 68.118,
"eval_steps_per_second": 4.287,
"step": 2012
},
{
"epoch": 84.0,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6867479681968689,
"eval_runtime": 4.6871,
"eval_samples_per_second": 61.018,
"eval_steps_per_second": 3.84,
"step": 2037
},
{
"epoch": 84.54,
"grad_norm": 2.0219666957855225,
"learning_rate": 2.195e-05,
"loss": 0.4148,
"step": 2050
},
{
"epoch": 84.99,
"eval_accuracy": 0.7377622377622378,
"eval_loss": 0.7070020437240601,
"eval_runtime": 5.9326,
"eval_samples_per_second": 48.208,
"eval_steps_per_second": 3.034,
"step": 2061
},
{
"epoch": 85.98,
"eval_accuracy": 0.7447552447552448,
"eval_loss": 0.7030305862426758,
"eval_runtime": 5.3564,
"eval_samples_per_second": 53.394,
"eval_steps_per_second": 3.36,
"step": 2085
},
{
"epoch": 86.6,
"grad_norm": 1.4678714275360107,
"learning_rate": 2.1672222222222223e-05,
"loss": 0.3923,
"step": 2100
},
{
"epoch": 86.97,
"eval_accuracy": 0.7587412587412588,
"eval_loss": 0.678174614906311,
"eval_runtime": 3.9745,
"eval_samples_per_second": 71.96,
"eval_steps_per_second": 4.529,
"step": 2109
},
{
"epoch": 88.0,
"eval_accuracy": 0.7412587412587412,
"eval_loss": 0.7166118621826172,
"eval_runtime": 4.0358,
"eval_samples_per_second": 70.866,
"eval_steps_per_second": 4.46,
"step": 2134
},
{
"epoch": 88.66,
"grad_norm": 1.589543342590332,
"learning_rate": 2.1394444444444445e-05,
"loss": 0.3964,
"step": 2150
},
{
"epoch": 88.99,
"eval_accuracy": 0.7482517482517482,
"eval_loss": 0.7075912952423096,
"eval_runtime": 5.0331,
"eval_samples_per_second": 56.823,
"eval_steps_per_second": 3.576,
"step": 2158
},
{
"epoch": 89.98,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6867172122001648,
"eval_runtime": 5.386,
"eval_samples_per_second": 53.101,
"eval_steps_per_second": 3.342,
"step": 2182
},
{
"epoch": 90.72,
"grad_norm": 1.3886605501174927,
"learning_rate": 2.1116666666666667e-05,
"loss": 0.3846,
"step": 2200
},
{
"epoch": 90.97,
"eval_accuracy": 0.7517482517482518,
"eval_loss": 0.6913285851478577,
"eval_runtime": 5.5324,
"eval_samples_per_second": 51.696,
"eval_steps_per_second": 3.254,
"step": 2206
},
{
"epoch": 92.0,
"eval_accuracy": 0.7482517482517482,
"eval_loss": 0.7160294651985168,
"eval_runtime": 5.2753,
"eval_samples_per_second": 54.215,
"eval_steps_per_second": 3.412,
"step": 2231
},
{
"epoch": 92.78,
"grad_norm": 2.4106783866882324,
"learning_rate": 2.083888888888889e-05,
"loss": 0.3654,
"step": 2250
},
{
"epoch": 92.99,
"eval_accuracy": 0.7517482517482518,
"eval_loss": 0.6765207052230835,
"eval_runtime": 5.5671,
"eval_samples_per_second": 51.373,
"eval_steps_per_second": 3.233,
"step": 2255
},
{
"epoch": 93.98,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6881967186927795,
"eval_runtime": 3.8228,
"eval_samples_per_second": 74.814,
"eval_steps_per_second": 4.709,
"step": 2279
},
{
"epoch": 94.85,
"grad_norm": 0.8871183395385742,
"learning_rate": 2.0561111111111114e-05,
"loss": 0.3577,
"step": 2300
},
{
"epoch": 94.97,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6852585673332214,
"eval_runtime": 4.7228,
"eval_samples_per_second": 60.557,
"eval_steps_per_second": 3.811,
"step": 2303
},
{
"epoch": 96.0,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.7158808708190918,
"eval_runtime": 5.6504,
"eval_samples_per_second": 50.616,
"eval_steps_per_second": 3.186,
"step": 2328
},
{
"epoch": 96.91,
"grad_norm": 1.0019863843917847,
"learning_rate": 2.0283333333333333e-05,
"loss": 0.37,
"step": 2350
},
{
"epoch": 96.99,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6943120360374451,
"eval_runtime": 4.8337,
"eval_samples_per_second": 59.168,
"eval_steps_per_second": 3.724,
"step": 2352
},
{
"epoch": 97.98,
"eval_accuracy": 0.7587412587412588,
"eval_loss": 0.7010317444801331,
"eval_runtime": 4.6874,
"eval_samples_per_second": 61.015,
"eval_steps_per_second": 3.84,
"step": 2376
},
{
"epoch": 98.97,
"grad_norm": 1.2908928394317627,
"learning_rate": 2.0005555555555555e-05,
"loss": 0.3473,
"step": 2400
},
{
"epoch": 98.97,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.693758487701416,
"eval_runtime": 4.7585,
"eval_samples_per_second": 60.103,
"eval_steps_per_second": 3.783,
"step": 2400
},
{
"epoch": 100.0,
"eval_accuracy": 0.7587412587412588,
"eval_loss": 0.6918778419494629,
"eval_runtime": 6.6891,
"eval_samples_per_second": 42.756,
"eval_steps_per_second": 2.691,
"step": 2425
},
{
"epoch": 100.99,
"eval_accuracy": 0.7552447552447552,
"eval_loss": 0.6849302053451538,
"eval_runtime": 4.4685,
"eval_samples_per_second": 64.003,
"eval_steps_per_second": 4.028,
"step": 2449
},
{
"epoch": 101.03,
"grad_norm": 1.1730871200561523,
"learning_rate": 1.972777777777778e-05,
"loss": 0.3587,
"step": 2450
},
{
"epoch": 101.98,
"eval_accuracy": 0.7587412587412588,
"eval_loss": 0.6855939030647278,
"eval_runtime": 4.3434,
"eval_samples_per_second": 65.847,
"eval_steps_per_second": 4.144,
"step": 2473
},
{
"epoch": 102.97,
"eval_accuracy": 0.7517482517482518,
"eval_loss": 0.7046144604682922,
"eval_runtime": 4.7166,
"eval_samples_per_second": 60.637,
"eval_steps_per_second": 3.816,
"step": 2497
},
{
"epoch": 103.09,
"grad_norm": 1.3693217039108276,
"learning_rate": 1.945e-05,
"loss": 0.3429,
"step": 2500
},
{
"epoch": 104.0,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6892997622489929,
"eval_runtime": 5.3868,
"eval_samples_per_second": 53.092,
"eval_steps_per_second": 3.341,
"step": 2522
},
{
"epoch": 104.99,
"eval_accuracy": 0.7622377622377622,
"eval_loss": 0.6913393139839172,
"eval_runtime": 5.09,
"eval_samples_per_second": 56.188,
"eval_steps_per_second": 3.536,
"step": 2546
},
{
"epoch": 105.15,
"grad_norm": 1.923829436302185,
"learning_rate": 1.9172222222222224e-05,
"loss": 0.3549,
"step": 2550
},
{
"epoch": 105.98,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6880810856819153,
"eval_runtime": 4.6668,
"eval_samples_per_second": 61.283,
"eval_steps_per_second": 3.857,
"step": 2570
},
{
"epoch": 106.97,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.7097887396812439,
"eval_runtime": 6.4652,
"eval_samples_per_second": 44.237,
"eval_steps_per_second": 2.784,
"step": 2594
},
{
"epoch": 107.22,
"grad_norm": 2.702012062072754,
"learning_rate": 1.8894444444444446e-05,
"loss": 0.3403,
"step": 2600
},
{
"epoch": 108.0,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6878336668014526,
"eval_runtime": 4.6923,
"eval_samples_per_second": 60.951,
"eval_steps_per_second": 3.836,
"step": 2619
},
{
"epoch": 108.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.695954442024231,
"eval_runtime": 4.4809,
"eval_samples_per_second": 63.827,
"eval_steps_per_second": 4.017,
"step": 2643
},
{
"epoch": 109.28,
"grad_norm": 2.3427536487579346,
"learning_rate": 1.8616666666666667e-05,
"loss": 0.3253,
"step": 2650
},
{
"epoch": 109.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7005948424339294,
"eval_runtime": 4.8882,
"eval_samples_per_second": 58.508,
"eval_steps_per_second": 3.682,
"step": 2667
},
{
"epoch": 110.97,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.6916196346282959,
"eval_runtime": 5.2891,
"eval_samples_per_second": 54.073,
"eval_steps_per_second": 3.403,
"step": 2691
},
{
"epoch": 111.34,
"grad_norm": 2.178089141845703,
"learning_rate": 1.833888888888889e-05,
"loss": 0.3332,
"step": 2700
},
{
"epoch": 112.0,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.7059447765350342,
"eval_runtime": 4.7437,
"eval_samples_per_second": 60.291,
"eval_steps_per_second": 3.795,
"step": 2716
},
{
"epoch": 112.99,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.6904045939445496,
"eval_runtime": 4.9942,
"eval_samples_per_second": 57.267,
"eval_steps_per_second": 3.604,
"step": 2740
},
{
"epoch": 113.4,
"grad_norm": 1.1625444889068604,
"learning_rate": 1.806111111111111e-05,
"loss": 0.3188,
"step": 2750
},
{
"epoch": 113.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6970774531364441,
"eval_runtime": 6.4809,
"eval_samples_per_second": 44.13,
"eval_steps_per_second": 2.777,
"step": 2764
},
{
"epoch": 114.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.700820803642273,
"eval_runtime": 5.2617,
"eval_samples_per_second": 54.355,
"eval_steps_per_second": 3.421,
"step": 2788
},
{
"epoch": 115.46,
"grad_norm": 1.2394715547561646,
"learning_rate": 1.7783333333333333e-05,
"loss": 0.3112,
"step": 2800
},
{
"epoch": 116.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7002130150794983,
"eval_runtime": 5.0937,
"eval_samples_per_second": 56.147,
"eval_steps_per_second": 3.534,
"step": 2813
},
{
"epoch": 116.99,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.6909505724906921,
"eval_runtime": 4.7575,
"eval_samples_per_second": 60.116,
"eval_steps_per_second": 3.784,
"step": 2837
},
{
"epoch": 117.53,
"grad_norm": 2.4334964752197266,
"learning_rate": 1.7505555555555558e-05,
"loss": 0.3153,
"step": 2850
},
{
"epoch": 117.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.6957750916481018,
"eval_runtime": 4.8105,
"eval_samples_per_second": 59.453,
"eval_steps_per_second": 3.742,
"step": 2861
},
{
"epoch": 118.97,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6867520213127136,
"eval_runtime": 4.5411,
"eval_samples_per_second": 62.98,
"eval_steps_per_second": 3.964,
"step": 2885
},
{
"epoch": 119.59,
"grad_norm": 0.769097089767456,
"learning_rate": 1.7227777777777777e-05,
"loss": 0.3006,
"step": 2900
},
{
"epoch": 120.0,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6890790462493896,
"eval_runtime": 4.5864,
"eval_samples_per_second": 62.358,
"eval_steps_per_second": 3.925,
"step": 2910
},
{
"epoch": 120.99,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6889089941978455,
"eval_runtime": 6.5804,
"eval_samples_per_second": 43.462,
"eval_steps_per_second": 2.735,
"step": 2934
},
{
"epoch": 121.65,
"grad_norm": 1.8714542388916016,
"learning_rate": 1.695e-05,
"loss": 0.2967,
"step": 2950
},
{
"epoch": 121.98,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6935350894927979,
"eval_runtime": 4.7491,
"eval_samples_per_second": 60.223,
"eval_steps_per_second": 3.79,
"step": 2958
},
{
"epoch": 122.97,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.7058219909667969,
"eval_runtime": 4.8941,
"eval_samples_per_second": 58.438,
"eval_steps_per_second": 3.678,
"step": 2982
},
{
"epoch": 123.71,
"grad_norm": 2.062924385070801,
"learning_rate": 1.6672222222222224e-05,
"loss": 0.2939,
"step": 3000
},
{
"epoch": 124.0,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.7220865488052368,
"eval_runtime": 5.0487,
"eval_samples_per_second": 56.648,
"eval_steps_per_second": 3.565,
"step": 3007
},
{
"epoch": 124.99,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6857044696807861,
"eval_runtime": 5.6134,
"eval_samples_per_second": 50.95,
"eval_steps_per_second": 3.207,
"step": 3031
},
{
"epoch": 125.77,
"grad_norm": 1.7039302587509155,
"learning_rate": 1.6394444444444446e-05,
"loss": 0.3101,
"step": 3050
},
{
"epoch": 125.98,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6742061972618103,
"eval_runtime": 5.3609,
"eval_samples_per_second": 53.349,
"eval_steps_per_second": 3.358,
"step": 3055
},
{
"epoch": 126.97,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7029407620429993,
"eval_runtime": 5.8891,
"eval_samples_per_second": 48.564,
"eval_steps_per_second": 3.056,
"step": 3079
},
{
"epoch": 127.84,
"grad_norm": 1.434970736503601,
"learning_rate": 1.6116666666666668e-05,
"loss": 0.284,
"step": 3100
},
{
"epoch": 128.0,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.682050347328186,
"eval_runtime": 5.1437,
"eval_samples_per_second": 55.602,
"eval_steps_per_second": 3.499,
"step": 3104
},
{
"epoch": 128.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.68370121717453,
"eval_runtime": 4.2733,
"eval_samples_per_second": 66.927,
"eval_steps_per_second": 4.212,
"step": 3128
},
{
"epoch": 129.9,
"grad_norm": 1.320789098739624,
"learning_rate": 1.583888888888889e-05,
"loss": 0.2902,
"step": 3150
},
{
"epoch": 129.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6823462843894958,
"eval_runtime": 5.7566,
"eval_samples_per_second": 49.682,
"eval_steps_per_second": 3.127,
"step": 3152
},
{
"epoch": 130.97,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6950440406799316,
"eval_runtime": 4.9248,
"eval_samples_per_second": 58.074,
"eval_steps_per_second": 3.655,
"step": 3176
},
{
"epoch": 131.96,
"grad_norm": 2.1280930042266846,
"learning_rate": 1.556111111111111e-05,
"loss": 0.301,
"step": 3200
},
{
"epoch": 132.0,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.6800761818885803,
"eval_runtime": 8.1328,
"eval_samples_per_second": 35.166,
"eval_steps_per_second": 2.213,
"step": 3201
},
{
"epoch": 132.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6867505311965942,
"eval_runtime": 4.2532,
"eval_samples_per_second": 67.244,
"eval_steps_per_second": 4.232,
"step": 3225
},
{
"epoch": 133.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7061284184455872,
"eval_runtime": 5.3031,
"eval_samples_per_second": 53.93,
"eval_steps_per_second": 3.394,
"step": 3249
},
{
"epoch": 134.02,
"grad_norm": 1.532638669013977,
"learning_rate": 1.5283333333333333e-05,
"loss": 0.2736,
"step": 3250
},
{
"epoch": 134.97,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7114368677139282,
"eval_runtime": 4.6536,
"eval_samples_per_second": 61.458,
"eval_steps_per_second": 3.868,
"step": 3273
},
{
"epoch": 136.0,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6914551854133606,
"eval_runtime": 4.5505,
"eval_samples_per_second": 62.851,
"eval_steps_per_second": 3.956,
"step": 3298
},
{
"epoch": 136.08,
"grad_norm": 2.0108492374420166,
"learning_rate": 1.5005555555555555e-05,
"loss": 0.2931,
"step": 3300
},
{
"epoch": 136.99,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7055917978286743,
"eval_runtime": 5.3067,
"eval_samples_per_second": 53.894,
"eval_steps_per_second": 3.392,
"step": 3322
},
{
"epoch": 137.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7026935815811157,
"eval_runtime": 5.186,
"eval_samples_per_second": 55.149,
"eval_steps_per_second": 3.471,
"step": 3346
},
{
"epoch": 138.14,
"grad_norm": 1.0804469585418701,
"learning_rate": 1.4727777777777779e-05,
"loss": 0.2864,
"step": 3350
},
{
"epoch": 138.97,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6983500719070435,
"eval_runtime": 6.955,
"eval_samples_per_second": 41.122,
"eval_steps_per_second": 2.588,
"step": 3370
},
{
"epoch": 140.0,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.7168787121772766,
"eval_runtime": 4.234,
"eval_samples_per_second": 67.548,
"eval_steps_per_second": 4.251,
"step": 3395
},
{
"epoch": 140.21,
"grad_norm": 2.370694637298584,
"learning_rate": 1.445e-05,
"loss": 0.2765,
"step": 3400
},
{
"epoch": 140.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6960318088531494,
"eval_runtime": 5.0294,
"eval_samples_per_second": 56.865,
"eval_steps_per_second": 3.579,
"step": 3419
},
{
"epoch": 141.98,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.6990492343902588,
"eval_runtime": 5.2727,
"eval_samples_per_second": 54.242,
"eval_steps_per_second": 3.414,
"step": 3443
},
{
"epoch": 142.27,
"grad_norm": 1.6676194667816162,
"learning_rate": 1.4172222222222222e-05,
"loss": 0.2808,
"step": 3450
},
{
"epoch": 142.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.706200897693634,
"eval_runtime": 4.5273,
"eval_samples_per_second": 63.173,
"eval_steps_per_second": 3.976,
"step": 3467
},
{
"epoch": 144.0,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.6821764707565308,
"eval_runtime": 5.3614,
"eval_samples_per_second": 53.344,
"eval_steps_per_second": 3.357,
"step": 3492
},
{
"epoch": 144.33,
"grad_norm": 1.9151145219802856,
"learning_rate": 1.3894444444444444e-05,
"loss": 0.2712,
"step": 3500
},
{
"epoch": 144.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7063603401184082,
"eval_runtime": 4.9088,
"eval_samples_per_second": 58.263,
"eval_steps_per_second": 3.667,
"step": 3516
},
{
"epoch": 145.98,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.7150112390518188,
"eval_runtime": 7.2044,
"eval_samples_per_second": 39.698,
"eval_steps_per_second": 2.498,
"step": 3540
},
{
"epoch": 146.39,
"grad_norm": 1.5093848705291748,
"learning_rate": 1.3622222222222223e-05,
"loss": 0.2726,
"step": 3550
},
{
"epoch": 146.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.696849524974823,
"eval_runtime": 4.9386,
"eval_samples_per_second": 57.911,
"eval_steps_per_second": 3.645,
"step": 3564
},
{
"epoch": 148.0,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7086759209632874,
"eval_runtime": 4.4363,
"eval_samples_per_second": 64.468,
"eval_steps_per_second": 4.057,
"step": 3589
},
{
"epoch": 148.45,
"grad_norm": 1.4403679370880127,
"learning_rate": 1.3344444444444444e-05,
"loss": 0.2607,
"step": 3600
},
{
"epoch": 148.99,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.7129560112953186,
"eval_runtime": 5.3809,
"eval_samples_per_second": 53.15,
"eval_steps_per_second": 3.345,
"step": 3613
},
{
"epoch": 149.98,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7080287933349609,
"eval_runtime": 5.8187,
"eval_samples_per_second": 49.152,
"eval_steps_per_second": 3.093,
"step": 3637
},
{
"epoch": 150.52,
"grad_norm": 2.036515235900879,
"learning_rate": 1.3066666666666666e-05,
"loss": 0.2546,
"step": 3650
},
{
"epoch": 150.97,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7088435888290405,
"eval_runtime": 4.8742,
"eval_samples_per_second": 58.677,
"eval_steps_per_second": 3.693,
"step": 3661
},
{
"epoch": 152.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7030193209648132,
"eval_runtime": 4.9492,
"eval_samples_per_second": 57.787,
"eval_steps_per_second": 3.637,
"step": 3686
},
{
"epoch": 152.58,
"grad_norm": 1.200052261352539,
"learning_rate": 1.2788888888888888e-05,
"loss": 0.2563,
"step": 3700
},
{
"epoch": 152.99,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.7077969908714294,
"eval_runtime": 4.614,
"eval_samples_per_second": 61.985,
"eval_steps_per_second": 3.901,
"step": 3710
},
{
"epoch": 153.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.700455904006958,
"eval_runtime": 5.7657,
"eval_samples_per_second": 49.604,
"eval_steps_per_second": 3.122,
"step": 3734
},
{
"epoch": 154.64,
"grad_norm": 2.2751214504241943,
"learning_rate": 1.2511111111111112e-05,
"loss": 0.2531,
"step": 3750
},
{
"epoch": 154.97,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7160292267799377,
"eval_runtime": 5.1079,
"eval_samples_per_second": 55.992,
"eval_steps_per_second": 3.524,
"step": 3758
},
{
"epoch": 156.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7175909876823425,
"eval_runtime": 5.4035,
"eval_samples_per_second": 52.929,
"eval_steps_per_second": 3.331,
"step": 3783
},
{
"epoch": 156.7,
"grad_norm": 1.9024412631988525,
"learning_rate": 1.2233333333333334e-05,
"loss": 0.2446,
"step": 3800
},
{
"epoch": 156.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7190600037574768,
"eval_runtime": 4.3633,
"eval_samples_per_second": 65.546,
"eval_steps_per_second": 4.125,
"step": 3807
},
{
"epoch": 157.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.719641387462616,
"eval_runtime": 5.0426,
"eval_samples_per_second": 56.717,
"eval_steps_per_second": 3.57,
"step": 3831
},
{
"epoch": 158.76,
"grad_norm": 3.471806287765503,
"learning_rate": 1.1955555555555556e-05,
"loss": 0.2479,
"step": 3850
},
{
"epoch": 158.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7073430418968201,
"eval_runtime": 3.6336,
"eval_samples_per_second": 78.711,
"eval_steps_per_second": 4.954,
"step": 3855
},
{
"epoch": 160.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7328661680221558,
"eval_runtime": 5.2625,
"eval_samples_per_second": 54.347,
"eval_steps_per_second": 3.42,
"step": 3880
},
{
"epoch": 160.82,
"grad_norm": 2.1171793937683105,
"learning_rate": 1.1677777777777777e-05,
"loss": 0.2523,
"step": 3900
},
{
"epoch": 160.99,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7158821821212769,
"eval_runtime": 6.5877,
"eval_samples_per_second": 43.414,
"eval_steps_per_second": 2.732,
"step": 3904
},
{
"epoch": 161.98,
"eval_accuracy": 0.7692307692307693,
"eval_loss": 0.719171404838562,
"eval_runtime": 4.5674,
"eval_samples_per_second": 62.618,
"eval_steps_per_second": 3.941,
"step": 3928
},
{
"epoch": 162.89,
"grad_norm": 1.7515395879745483,
"learning_rate": 1.1400000000000001e-05,
"loss": 0.2523,
"step": 3950
},
{
"epoch": 162.97,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7281435132026672,
"eval_runtime": 4.4866,
"eval_samples_per_second": 63.746,
"eval_steps_per_second": 4.012,
"step": 3952
},
{
"epoch": 164.0,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7078841328620911,
"eval_runtime": 4.4241,
"eval_samples_per_second": 64.645,
"eval_steps_per_second": 4.069,
"step": 3977
},
{
"epoch": 164.95,
"grad_norm": 1.456335186958313,
"learning_rate": 1.1122222222222223e-05,
"loss": 0.2422,
"step": 4000
},
{
"epoch": 164.99,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7161521911621094,
"eval_runtime": 5.1239,
"eval_samples_per_second": 55.817,
"eval_steps_per_second": 3.513,
"step": 4001
},
{
"epoch": 165.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7190020084381104,
"eval_runtime": 3.4488,
"eval_samples_per_second": 82.926,
"eval_steps_per_second": 5.219,
"step": 4025
},
{
"epoch": 166.97,
"eval_accuracy": 0.7762237762237763,
"eval_loss": 0.7311248779296875,
"eval_runtime": 5.0389,
"eval_samples_per_second": 56.759,
"eval_steps_per_second": 3.572,
"step": 4049
},
{
"epoch": 167.01,
"grad_norm": 1.2554075717926025,
"learning_rate": 1.0844444444444445e-05,
"loss": 0.242,
"step": 4050
},
{
"epoch": 168.0,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7110462188720703,
"eval_runtime": 4.4612,
"eval_samples_per_second": 64.108,
"eval_steps_per_second": 4.035,
"step": 4074
},
{
"epoch": 168.99,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7028501629829407,
"eval_runtime": 6.955,
"eval_samples_per_second": 41.122,
"eval_steps_per_second": 2.588,
"step": 4098
},
{
"epoch": 169.07,
"grad_norm": 2.8003265857696533,
"learning_rate": 1.0566666666666667e-05,
"loss": 0.2392,
"step": 4100
},
{
"epoch": 169.98,
"eval_accuracy": 0.7937062937062938,
"eval_loss": 0.7108554840087891,
"eval_runtime": 5.0033,
"eval_samples_per_second": 57.162,
"eval_steps_per_second": 3.598,
"step": 4122
},
{
"epoch": 170.97,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7106384634971619,
"eval_runtime": 5.1984,
"eval_samples_per_second": 55.017,
"eval_steps_per_second": 3.463,
"step": 4146
},
{
"epoch": 171.13,
"grad_norm": 2.1897969245910645,
"learning_rate": 1.028888888888889e-05,
"loss": 0.247,
"step": 4150
},
{
"epoch": 172.0,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7151694297790527,
"eval_runtime": 5.1963,
"eval_samples_per_second": 55.039,
"eval_steps_per_second": 3.464,
"step": 4171
},
{
"epoch": 172.99,
"eval_accuracy": 0.7657342657342657,
"eval_loss": 0.7254167795181274,
"eval_runtime": 4.4466,
"eval_samples_per_second": 64.319,
"eval_steps_per_second": 4.048,
"step": 4195
},
{
"epoch": 173.2,
"grad_norm": 2.769357681274414,
"learning_rate": 1.0011111111111112e-05,
"loss": 0.2341,
"step": 4200
},
{
"epoch": 173.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7290962338447571,
"eval_runtime": 6.2221,
"eval_samples_per_second": 45.965,
"eval_steps_per_second": 2.893,
"step": 4219
},
{
"epoch": 174.97,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7088623046875,
"eval_runtime": 4.3709,
"eval_samples_per_second": 65.433,
"eval_steps_per_second": 4.118,
"step": 4243
},
{
"epoch": 175.26,
"grad_norm": 2.044703483581543,
"learning_rate": 9.733333333333332e-06,
"loss": 0.2317,
"step": 4250
},
{
"epoch": 176.0,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7185826897621155,
"eval_runtime": 5.4095,
"eval_samples_per_second": 52.87,
"eval_steps_per_second": 3.327,
"step": 4268
},
{
"epoch": 176.99,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7167823314666748,
"eval_runtime": 4.9506,
"eval_samples_per_second": 57.77,
"eval_steps_per_second": 3.636,
"step": 4292
},
{
"epoch": 177.32,
"grad_norm": 1.078834056854248,
"learning_rate": 9.455555555555556e-06,
"loss": 0.2269,
"step": 4300
},
{
"epoch": 177.98,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7237738966941833,
"eval_runtime": 4.781,
"eval_samples_per_second": 59.82,
"eval_steps_per_second": 3.765,
"step": 4316
},
{
"epoch": 178.97,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7131801247596741,
"eval_runtime": 4.6869,
"eval_samples_per_second": 61.022,
"eval_steps_per_second": 3.841,
"step": 4340
},
{
"epoch": 179.38,
"grad_norm": 2.008120536804199,
"learning_rate": 9.177777777777778e-06,
"loss": 0.2283,
"step": 4350
},
{
"epoch": 180.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7384253144264221,
"eval_runtime": 4.5879,
"eval_samples_per_second": 62.338,
"eval_steps_per_second": 3.923,
"step": 4365
},
{
"epoch": 180.99,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7002861499786377,
"eval_runtime": 5.3238,
"eval_samples_per_second": 53.721,
"eval_steps_per_second": 3.381,
"step": 4389
},
{
"epoch": 181.44,
"grad_norm": 1.9518792629241943,
"learning_rate": 8.900000000000001e-06,
"loss": 0.2303,
"step": 4400
},
{
"epoch": 181.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7278482913970947,
"eval_runtime": 5.8358,
"eval_samples_per_second": 49.008,
"eval_steps_per_second": 3.084,
"step": 4413
},
{
"epoch": 182.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7143127918243408,
"eval_runtime": 6.1229,
"eval_samples_per_second": 46.71,
"eval_steps_per_second": 2.94,
"step": 4437
},
{
"epoch": 183.51,
"grad_norm": 1.0936890840530396,
"learning_rate": 8.622222222222221e-06,
"loss": 0.2109,
"step": 4450
},
{
"epoch": 184.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7406834363937378,
"eval_runtime": 5.0467,
"eval_samples_per_second": 56.671,
"eval_steps_per_second": 3.567,
"step": 4462
},
{
"epoch": 184.99,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7053534388542175,
"eval_runtime": 5.279,
"eval_samples_per_second": 54.177,
"eval_steps_per_second": 3.41,
"step": 4486
},
{
"epoch": 185.57,
"grad_norm": 2.9350059032440186,
"learning_rate": 8.344444444444445e-06,
"loss": 0.2261,
"step": 4500
},
{
"epoch": 185.98,
"eval_accuracy": 0.7727272727272727,
"eval_loss": 0.7260809540748596,
"eval_runtime": 5.4165,
"eval_samples_per_second": 52.802,
"eval_steps_per_second": 3.323,
"step": 4510
},
{
"epoch": 186.97,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7240064144134521,
"eval_runtime": 5.4866,
"eval_samples_per_second": 52.127,
"eval_steps_per_second": 3.281,
"step": 4534
},
{
"epoch": 187.63,
"grad_norm": 1.8322782516479492,
"learning_rate": 8.066666666666667e-06,
"loss": 0.2282,
"step": 4550
},
{
"epoch": 188.0,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7199599146842957,
"eval_runtime": 4.6736,
"eval_samples_per_second": 61.195,
"eval_steps_per_second": 3.851,
"step": 4559
},
{
"epoch": 188.99,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7102844715118408,
"eval_runtime": 5.4219,
"eval_samples_per_second": 52.749,
"eval_steps_per_second": 3.32,
"step": 4583
},
{
"epoch": 189.69,
"grad_norm": 1.8777916431427002,
"learning_rate": 7.78888888888889e-06,
"loss": 0.2321,
"step": 4600
},
{
"epoch": 189.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7083376049995422,
"eval_runtime": 5.9634,
"eval_samples_per_second": 47.959,
"eval_steps_per_second": 3.018,
"step": 4607
},
{
"epoch": 190.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7244677543640137,
"eval_runtime": 5.2078,
"eval_samples_per_second": 54.918,
"eval_steps_per_second": 3.456,
"step": 4631
},
{
"epoch": 191.75,
"grad_norm": 1.5277408361434937,
"learning_rate": 7.5111111111111105e-06,
"loss": 0.2261,
"step": 4650
},
{
"epoch": 192.0,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7124583721160889,
"eval_runtime": 5.7079,
"eval_samples_per_second": 50.106,
"eval_steps_per_second": 3.154,
"step": 4656
},
{
"epoch": 192.99,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7308976054191589,
"eval_runtime": 5.3404,
"eval_samples_per_second": 53.554,
"eval_steps_per_second": 3.371,
"step": 4680
},
{
"epoch": 193.81,
"grad_norm": 2.095749616622925,
"learning_rate": 7.233333333333333e-06,
"loss": 0.2231,
"step": 4700
},
{
"epoch": 193.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7237818837165833,
"eval_runtime": 4.6666,
"eval_samples_per_second": 61.286,
"eval_steps_per_second": 3.857,
"step": 4704
},
{
"epoch": 194.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7253320217132568,
"eval_runtime": 5.8059,
"eval_samples_per_second": 49.261,
"eval_steps_per_second": 3.1,
"step": 4728
},
{
"epoch": 195.88,
"grad_norm": 1.6955636739730835,
"learning_rate": 6.955555555555556e-06,
"loss": 0.2083,
"step": 4750
},
{
"epoch": 196.0,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7240011692047119,
"eval_runtime": 6.0767,
"eval_samples_per_second": 47.065,
"eval_steps_per_second": 2.962,
"step": 4753
},
{
"epoch": 196.99,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7131750583648682,
"eval_runtime": 5.3063,
"eval_samples_per_second": 53.898,
"eval_steps_per_second": 3.392,
"step": 4777
},
{
"epoch": 197.94,
"grad_norm": 0.8933289051055908,
"learning_rate": 6.677777777777778e-06,
"loss": 0.2116,
"step": 4800
},
{
"epoch": 197.98,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7169559597969055,
"eval_runtime": 5.5713,
"eval_samples_per_second": 51.335,
"eval_steps_per_second": 3.231,
"step": 4801
},
{
"epoch": 198.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7265609502792358,
"eval_runtime": 4.1397,
"eval_samples_per_second": 69.087,
"eval_steps_per_second": 4.348,
"step": 4825
},
{
"epoch": 200.0,
"grad_norm": 2.175414562225342,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.2219,
"step": 4850
},
{
"epoch": 200.0,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7162622213363647,
"eval_runtime": 5.2016,
"eval_samples_per_second": 54.984,
"eval_steps_per_second": 3.461,
"step": 4850
},
{
"epoch": 200.99,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7302048802375793,
"eval_runtime": 4.9222,
"eval_samples_per_second": 58.104,
"eval_steps_per_second": 3.657,
"step": 4874
},
{
"epoch": 201.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7223746180534363,
"eval_runtime": 4.6884,
"eval_samples_per_second": 61.002,
"eval_steps_per_second": 3.839,
"step": 4898
},
{
"epoch": 202.06,
"grad_norm": 2.053739309310913,
"learning_rate": 6.1222222222222224e-06,
"loss": 0.2183,
"step": 4900
},
{
"epoch": 202.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7179226279258728,
"eval_runtime": 4.5556,
"eval_samples_per_second": 62.78,
"eval_steps_per_second": 3.951,
"step": 4922
},
{
"epoch": 204.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7245286107063293,
"eval_runtime": 5.7474,
"eval_samples_per_second": 49.762,
"eval_steps_per_second": 3.132,
"step": 4947
},
{
"epoch": 204.12,
"grad_norm": 1.1081063747406006,
"learning_rate": 5.844444444444444e-06,
"loss": 0.2053,
"step": 4950
},
{
"epoch": 204.99,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7344977259635925,
"eval_runtime": 5.4178,
"eval_samples_per_second": 52.789,
"eval_steps_per_second": 3.322,
"step": 4971
},
{
"epoch": 205.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7249557971954346,
"eval_runtime": 5.6352,
"eval_samples_per_second": 50.753,
"eval_steps_per_second": 3.194,
"step": 4995
},
{
"epoch": 206.19,
"grad_norm": 1.09213125705719,
"learning_rate": 5.566666666666667e-06,
"loss": 0.2113,
"step": 5000
},
{
"epoch": 206.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7246001958847046,
"eval_runtime": 4.9071,
"eval_samples_per_second": 58.283,
"eval_steps_per_second": 3.668,
"step": 5019
},
{
"epoch": 208.0,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7270117998123169,
"eval_runtime": 5.8385,
"eval_samples_per_second": 48.985,
"eval_steps_per_second": 3.083,
"step": 5044
},
{
"epoch": 208.25,
"grad_norm": 1.6693130731582642,
"learning_rate": 5.288888888888889e-06,
"loss": 0.2152,
"step": 5050
},
{
"epoch": 208.99,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7285901308059692,
"eval_runtime": 5.489,
"eval_samples_per_second": 52.104,
"eval_steps_per_second": 3.279,
"step": 5068
},
{
"epoch": 209.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7332947254180908,
"eval_runtime": 5.3017,
"eval_samples_per_second": 53.945,
"eval_steps_per_second": 3.395,
"step": 5092
},
{
"epoch": 210.31,
"grad_norm": 2.0511515140533447,
"learning_rate": 5.011111111111112e-06,
"loss": 0.2129,
"step": 5100
},
{
"epoch": 210.97,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7307863831520081,
"eval_runtime": 5.2991,
"eval_samples_per_second": 53.971,
"eval_steps_per_second": 3.397,
"step": 5116
},
{
"epoch": 212.0,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7176437973976135,
"eval_runtime": 4.9452,
"eval_samples_per_second": 57.834,
"eval_steps_per_second": 3.64,
"step": 5141
},
{
"epoch": 212.37,
"grad_norm": 1.8491023778915405,
"learning_rate": 4.7333333333333335e-06,
"loss": 0.2173,
"step": 5150
},
{
"epoch": 212.99,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7334882020950317,
"eval_runtime": 4.9602,
"eval_samples_per_second": 57.659,
"eval_steps_per_second": 3.629,
"step": 5165
},
{
"epoch": 213.98,
"eval_accuracy": 0.7797202797202797,
"eval_loss": 0.7268483638763428,
"eval_runtime": 5.885,
"eval_samples_per_second": 48.598,
"eval_steps_per_second": 3.059,
"step": 5189
},
{
"epoch": 214.43,
"grad_norm": 1.2067769765853882,
"learning_rate": 4.455555555555556e-06,
"loss": 0.2042,
"step": 5200
},
{
"epoch": 214.97,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7299237847328186,
"eval_runtime": 5.7645,
"eval_samples_per_second": 49.614,
"eval_steps_per_second": 3.123,
"step": 5213
},
{
"epoch": 216.0,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7360625863075256,
"eval_runtime": 4.7143,
"eval_samples_per_second": 60.667,
"eval_steps_per_second": 3.818,
"step": 5238
},
{
"epoch": 216.49,
"grad_norm": 1.3863427639007568,
"learning_rate": 4.177777777777777e-06,
"loss": 0.2112,
"step": 5250
},
{
"epoch": 216.99,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.723866879940033,
"eval_runtime": 5.3445,
"eval_samples_per_second": 53.513,
"eval_steps_per_second": 3.368,
"step": 5262
},
{
"epoch": 217.98,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.7252445220947266,
"eval_runtime": 4.6314,
"eval_samples_per_second": 61.753,
"eval_steps_per_second": 3.887,
"step": 5286
},
{
"epoch": 218.56,
"grad_norm": 1.1177924871444702,
"learning_rate": 3.9e-06,
"loss": 0.2007,
"step": 5300
},
{
"epoch": 218.97,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.719983696937561,
"eval_runtime": 4.865,
"eval_samples_per_second": 58.787,
"eval_steps_per_second": 3.7,
"step": 5310
},
{
"epoch": 220.0,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7195786237716675,
"eval_runtime": 5.5422,
"eval_samples_per_second": 51.604,
"eval_steps_per_second": 3.248,
"step": 5335
},
{
"epoch": 220.62,
"grad_norm": 1.413304090499878,
"learning_rate": 3.6222222222222226e-06,
"loss": 0.2163,
"step": 5350
},
{
"epoch": 220.99,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.7309580445289612,
"eval_runtime": 5.2512,
"eval_samples_per_second": 54.463,
"eval_steps_per_second": 3.428,
"step": 5359
},
{
"epoch": 221.98,
"eval_accuracy": 0.7867132867132867,
"eval_loss": 0.7313971519470215,
"eval_runtime": 5.1151,
"eval_samples_per_second": 55.913,
"eval_steps_per_second": 3.519,
"step": 5383
},
{
"epoch": 222.68,
"grad_norm": 3.0471901893615723,
"learning_rate": 3.3444444444444445e-06,
"loss": 0.2141,
"step": 5400
},
{
"epoch": 222.97,
"eval_accuracy": 0.7832167832167832,
"eval_loss": 0.727938175201416,
"eval_runtime": 4.6405,
"eval_samples_per_second": 61.631,
"eval_steps_per_second": 3.879,
"step": 5407
},
{
"epoch": 224.0,
"eval_accuracy": 0.7902097902097902,
"eval_loss": 0.725923478603363,
"eval_runtime": 5.0906,
"eval_samples_per_second": 56.182,
"eval_steps_per_second": 3.536,
"step": 5432
}
],
"logging_steps": 50,
"max_steps": 6000,
"num_input_tokens_seen": 0,
"num_train_epochs": 250,
"save_steps": 500,
"total_flos": 3.037085846065152e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}