spycoder's picture
yay
13b148c verified
{
"best_metric": 0.5304816365242004,
"best_model_checkpoint": "./vit-base-beans/checkpoint-1600",
"epoch": 4.0,
"eval_steps": 100,
"global_step": 1736,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02304147465437788,
"grad_norm": 2.396202564239502,
"learning_rate": 0.00019884792626728113,
"loss": 1.8485,
"step": 10
},
{
"epoch": 0.04608294930875576,
"grad_norm": 1.289166808128357,
"learning_rate": 0.00019769585253456222,
"loss": 1.5911,
"step": 20
},
{
"epoch": 0.06912442396313365,
"grad_norm": 2.512033462524414,
"learning_rate": 0.00019654377880184333,
"loss": 1.4806,
"step": 30
},
{
"epoch": 0.09216589861751152,
"grad_norm": 2.6234657764434814,
"learning_rate": 0.00019539170506912442,
"loss": 1.3684,
"step": 40
},
{
"epoch": 0.1152073732718894,
"grad_norm": 2.335149049758911,
"learning_rate": 0.00019423963133640554,
"loss": 1.4012,
"step": 50
},
{
"epoch": 0.1382488479262673,
"grad_norm": 3.386568546295166,
"learning_rate": 0.00019308755760368663,
"loss": 1.2248,
"step": 60
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.9273797273635864,
"learning_rate": 0.00019193548387096775,
"loss": 1.144,
"step": 70
},
{
"epoch": 0.18433179723502305,
"grad_norm": 2.2117414474487305,
"learning_rate": 0.00019078341013824886,
"loss": 1.0101,
"step": 80
},
{
"epoch": 0.2073732718894009,
"grad_norm": 3.1132171154022217,
"learning_rate": 0.00018963133640552998,
"loss": 1.1411,
"step": 90
},
{
"epoch": 0.2304147465437788,
"grad_norm": 3.0585570335388184,
"learning_rate": 0.00018847926267281107,
"loss": 1.0791,
"step": 100
},
{
"epoch": 0.2304147465437788,
"eval_accuracy": 0.6335113484646195,
"eval_loss": 1.0347875356674194,
"eval_runtime": 11.9052,
"eval_samples_per_second": 125.828,
"eval_steps_per_second": 15.791,
"step": 100
},
{
"epoch": 0.2534562211981567,
"grad_norm": 2.400747299194336,
"learning_rate": 0.00018732718894009219,
"loss": 1.04,
"step": 110
},
{
"epoch": 0.2764976958525346,
"grad_norm": 2.432607412338257,
"learning_rate": 0.00018617511520737328,
"loss": 1.0396,
"step": 120
},
{
"epoch": 0.2995391705069124,
"grad_norm": 2.5169568061828613,
"learning_rate": 0.0001850230414746544,
"loss": 0.9925,
"step": 130
},
{
"epoch": 0.3225806451612903,
"grad_norm": 2.450554847717285,
"learning_rate": 0.00018387096774193548,
"loss": 1.0361,
"step": 140
},
{
"epoch": 0.3456221198156682,
"grad_norm": 1.5931885242462158,
"learning_rate": 0.0001827188940092166,
"loss": 0.9851,
"step": 150
},
{
"epoch": 0.3686635944700461,
"grad_norm": 1.8019052743911743,
"learning_rate": 0.0001815668202764977,
"loss": 0.8847,
"step": 160
},
{
"epoch": 0.391705069124424,
"grad_norm": 2.283034086227417,
"learning_rate": 0.0001804147465437788,
"loss": 0.8507,
"step": 170
},
{
"epoch": 0.4147465437788018,
"grad_norm": 2.5878796577453613,
"learning_rate": 0.0001792626728110599,
"loss": 0.9579,
"step": 180
},
{
"epoch": 0.4377880184331797,
"grad_norm": 3.469618558883667,
"learning_rate": 0.000178110599078341,
"loss": 0.9453,
"step": 190
},
{
"epoch": 0.4608294930875576,
"grad_norm": 1.9743025302886963,
"learning_rate": 0.00017695852534562213,
"loss": 0.9415,
"step": 200
},
{
"epoch": 0.4608294930875576,
"eval_accuracy": 0.6448598130841121,
"eval_loss": 0.9576324820518494,
"eval_runtime": 11.862,
"eval_samples_per_second": 126.285,
"eval_steps_per_second": 15.849,
"step": 200
},
{
"epoch": 0.4838709677419355,
"grad_norm": 3.031723976135254,
"learning_rate": 0.00017580645161290325,
"loss": 0.7819,
"step": 210
},
{
"epoch": 0.5069124423963134,
"grad_norm": 2.2470805644989014,
"learning_rate": 0.00017465437788018436,
"loss": 0.8163,
"step": 220
},
{
"epoch": 0.5299539170506913,
"grad_norm": 1.723471760749817,
"learning_rate": 0.00017350230414746545,
"loss": 0.6728,
"step": 230
},
{
"epoch": 0.5529953917050692,
"grad_norm": 3.93212628364563,
"learning_rate": 0.00017235023041474657,
"loss": 0.684,
"step": 240
},
{
"epoch": 0.576036866359447,
"grad_norm": 1.4867981672286987,
"learning_rate": 0.00017119815668202766,
"loss": 0.8527,
"step": 250
},
{
"epoch": 0.5990783410138248,
"grad_norm": 2.4340641498565674,
"learning_rate": 0.00017004608294930878,
"loss": 1.0102,
"step": 260
},
{
"epoch": 0.6221198156682027,
"grad_norm": 2.8441660404205322,
"learning_rate": 0.00016889400921658987,
"loss": 0.7739,
"step": 270
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.6598294973373413,
"learning_rate": 0.00016774193548387098,
"loss": 0.7442,
"step": 280
},
{
"epoch": 0.6682027649769585,
"grad_norm": 3.455202102661133,
"learning_rate": 0.00016658986175115207,
"loss": 0.7643,
"step": 290
},
{
"epoch": 0.6912442396313364,
"grad_norm": 2.480116367340088,
"learning_rate": 0.0001654377880184332,
"loss": 0.7839,
"step": 300
},
{
"epoch": 0.6912442396313364,
"eval_accuracy": 0.6662216288384513,
"eval_loss": 0.89629727602005,
"eval_runtime": 11.7103,
"eval_samples_per_second": 127.921,
"eval_steps_per_second": 16.054,
"step": 300
},
{
"epoch": 0.7142857142857143,
"grad_norm": 3.3055620193481445,
"learning_rate": 0.00016428571428571428,
"loss": 0.639,
"step": 310
},
{
"epoch": 0.7373271889400922,
"grad_norm": 1.8542070388793945,
"learning_rate": 0.0001631336405529954,
"loss": 0.8931,
"step": 320
},
{
"epoch": 0.7603686635944701,
"grad_norm": 1.6089766025543213,
"learning_rate": 0.00016198156682027649,
"loss": 0.9023,
"step": 330
},
{
"epoch": 0.783410138248848,
"grad_norm": 1.5780836343765259,
"learning_rate": 0.0001608294930875576,
"loss": 0.7285,
"step": 340
},
{
"epoch": 0.8064516129032258,
"grad_norm": 3.153092384338379,
"learning_rate": 0.00015967741935483872,
"loss": 0.8702,
"step": 350
},
{
"epoch": 0.8294930875576036,
"grad_norm": 2.3161656856536865,
"learning_rate": 0.00015852534562211984,
"loss": 0.7343,
"step": 360
},
{
"epoch": 0.8525345622119815,
"grad_norm": 1.7923251390457153,
"learning_rate": 0.00015737327188940093,
"loss": 0.7986,
"step": 370
},
{
"epoch": 0.8755760368663594,
"grad_norm": 2.7093405723571777,
"learning_rate": 0.00015622119815668204,
"loss": 0.6377,
"step": 380
},
{
"epoch": 0.8986175115207373,
"grad_norm": 4.7555251121521,
"learning_rate": 0.00015506912442396313,
"loss": 0.8223,
"step": 390
},
{
"epoch": 0.9216589861751152,
"grad_norm": 2.78916072845459,
"learning_rate": 0.00015391705069124425,
"loss": 0.7181,
"step": 400
},
{
"epoch": 0.9216589861751152,
"eval_accuracy": 0.6962616822429907,
"eval_loss": 0.8479276299476624,
"eval_runtime": 11.6609,
"eval_samples_per_second": 128.464,
"eval_steps_per_second": 16.122,
"step": 400
},
{
"epoch": 0.9447004608294931,
"grad_norm": 2.4783871173858643,
"learning_rate": 0.00015276497695852537,
"loss": 0.7422,
"step": 410
},
{
"epoch": 0.967741935483871,
"grad_norm": 2.8775382041931152,
"learning_rate": 0.00015161290322580646,
"loss": 0.6255,
"step": 420
},
{
"epoch": 0.9907834101382489,
"grad_norm": 2.3851194381713867,
"learning_rate": 0.00015046082949308757,
"loss": 0.7266,
"step": 430
},
{
"epoch": 1.0138248847926268,
"grad_norm": 5.285385608673096,
"learning_rate": 0.00014930875576036866,
"loss": 0.6283,
"step": 440
},
{
"epoch": 1.0368663594470047,
"grad_norm": 1.691789984703064,
"learning_rate": 0.00014815668202764978,
"loss": 0.4918,
"step": 450
},
{
"epoch": 1.0599078341013826,
"grad_norm": 2.8921382427215576,
"learning_rate": 0.00014700460829493087,
"loss": 0.5787,
"step": 460
},
{
"epoch": 1.0829493087557605,
"grad_norm": 3.1509757041931152,
"learning_rate": 0.00014585253456221199,
"loss": 0.4906,
"step": 470
},
{
"epoch": 1.1059907834101383,
"grad_norm": 3.2979822158813477,
"learning_rate": 0.0001447004608294931,
"loss": 0.5715,
"step": 480
},
{
"epoch": 1.129032258064516,
"grad_norm": 3.3389899730682373,
"learning_rate": 0.00014354838709677422,
"loss": 0.5411,
"step": 490
},
{
"epoch": 1.1520737327188941,
"grad_norm": 0.9589664936065674,
"learning_rate": 0.0001423963133640553,
"loss": 0.3995,
"step": 500
},
{
"epoch": 1.1520737327188941,
"eval_accuracy": 0.7169559412550067,
"eval_loss": 0.7820530533790588,
"eval_runtime": 11.5056,
"eval_samples_per_second": 130.197,
"eval_steps_per_second": 16.34,
"step": 500
},
{
"epoch": 1.1751152073732718,
"grad_norm": 2.248042106628418,
"learning_rate": 0.00014124423963133643,
"loss": 0.5057,
"step": 510
},
{
"epoch": 1.1981566820276497,
"grad_norm": 3.944963216781616,
"learning_rate": 0.00014009216589861752,
"loss": 0.5005,
"step": 520
},
{
"epoch": 1.2211981566820276,
"grad_norm": 2.7981412410736084,
"learning_rate": 0.00013894009216589863,
"loss": 0.6703,
"step": 530
},
{
"epoch": 1.2442396313364055,
"grad_norm": 1.683069109916687,
"learning_rate": 0.00013778801843317972,
"loss": 0.5394,
"step": 540
},
{
"epoch": 1.2672811059907834,
"grad_norm": 1.2122957706451416,
"learning_rate": 0.00013663594470046084,
"loss": 0.4775,
"step": 550
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.4005225896835327,
"learning_rate": 0.00013548387096774193,
"loss": 0.4467,
"step": 560
},
{
"epoch": 1.3133640552995391,
"grad_norm": 2.5969114303588867,
"learning_rate": 0.00013433179723502305,
"loss": 0.4289,
"step": 570
},
{
"epoch": 1.336405529953917,
"grad_norm": 3.344553232192993,
"learning_rate": 0.00013317972350230414,
"loss": 0.4631,
"step": 580
},
{
"epoch": 1.359447004608295,
"grad_norm": 1.6798585653305054,
"learning_rate": 0.00013202764976958525,
"loss": 0.4329,
"step": 590
},
{
"epoch": 1.3824884792626728,
"grad_norm": 1.3849396705627441,
"learning_rate": 0.00013087557603686637,
"loss": 0.5025,
"step": 600
},
{
"epoch": 1.3824884792626728,
"eval_accuracy": 0.7837116154873164,
"eval_loss": 0.6299713253974915,
"eval_runtime": 11.705,
"eval_samples_per_second": 127.979,
"eval_steps_per_second": 16.061,
"step": 600
},
{
"epoch": 1.4055299539170507,
"grad_norm": 2.550548791885376,
"learning_rate": 0.00012972350230414746,
"loss": 0.4463,
"step": 610
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.063411235809326,
"learning_rate": 0.00012857142857142858,
"loss": 0.3624,
"step": 620
},
{
"epoch": 1.4516129032258065,
"grad_norm": 6.676961898803711,
"learning_rate": 0.0001274193548387097,
"loss": 0.4446,
"step": 630
},
{
"epoch": 1.4746543778801844,
"grad_norm": 0.8720624446868896,
"learning_rate": 0.0001262672811059908,
"loss": 0.5162,
"step": 640
},
{
"epoch": 1.4976958525345623,
"grad_norm": 2.214848041534424,
"learning_rate": 0.0001251152073732719,
"loss": 0.2978,
"step": 650
},
{
"epoch": 1.52073732718894,
"grad_norm": 5.083272457122803,
"learning_rate": 0.00012396313364055302,
"loss": 0.5157,
"step": 660
},
{
"epoch": 1.543778801843318,
"grad_norm": 4.042588710784912,
"learning_rate": 0.0001228110599078341,
"loss": 0.5338,
"step": 670
},
{
"epoch": 1.5668202764976957,
"grad_norm": 3.1029160022735596,
"learning_rate": 0.00012165898617511522,
"loss": 0.4767,
"step": 680
},
{
"epoch": 1.5898617511520738,
"grad_norm": 1.4430710077285767,
"learning_rate": 0.00012050691244239631,
"loss": 0.5531,
"step": 690
},
{
"epoch": 1.6129032258064515,
"grad_norm": 11.178030967712402,
"learning_rate": 0.00011935483870967743,
"loss": 0.4985,
"step": 700
},
{
"epoch": 1.6129032258064515,
"eval_accuracy": 0.7489986648865153,
"eval_loss": 0.7058817744255066,
"eval_runtime": 11.9139,
"eval_samples_per_second": 125.736,
"eval_steps_per_second": 15.78,
"step": 700
},
{
"epoch": 1.6359447004608296,
"grad_norm": 3.918297529220581,
"learning_rate": 0.00011820276497695852,
"loss": 0.5471,
"step": 710
},
{
"epoch": 1.6589861751152073,
"grad_norm": 2.7170467376708984,
"learning_rate": 0.00011705069124423964,
"loss": 0.4797,
"step": 720
},
{
"epoch": 1.6820276497695854,
"grad_norm": 1.0436949729919434,
"learning_rate": 0.00011589861751152074,
"loss": 0.427,
"step": 730
},
{
"epoch": 1.705069124423963,
"grad_norm": 3.6829638481140137,
"learning_rate": 0.00011474654377880186,
"loss": 0.5121,
"step": 740
},
{
"epoch": 1.728110599078341,
"grad_norm": 1.8748345375061035,
"learning_rate": 0.00011359447004608295,
"loss": 0.4227,
"step": 750
},
{
"epoch": 1.7511520737327189,
"grad_norm": 4.548758506774902,
"learning_rate": 0.00011244239631336406,
"loss": 0.3164,
"step": 760
},
{
"epoch": 1.7741935483870968,
"grad_norm": 3.4847280979156494,
"learning_rate": 0.00011129032258064515,
"loss": 0.5092,
"step": 770
},
{
"epoch": 1.7972350230414746,
"grad_norm": 1.8869714736938477,
"learning_rate": 0.00011013824884792627,
"loss": 0.4472,
"step": 780
},
{
"epoch": 1.8202764976958525,
"grad_norm": 3.899409770965576,
"learning_rate": 0.00010898617511520739,
"loss": 0.4708,
"step": 790
},
{
"epoch": 1.8433179723502304,
"grad_norm": 1.543060541152954,
"learning_rate": 0.00010783410138248849,
"loss": 0.4388,
"step": 800
},
{
"epoch": 1.8433179723502304,
"eval_accuracy": 0.7857142857142857,
"eval_loss": 0.5893343091011047,
"eval_runtime": 11.4174,
"eval_samples_per_second": 131.203,
"eval_steps_per_second": 16.466,
"step": 800
},
{
"epoch": 1.8663594470046083,
"grad_norm": 5.587724208831787,
"learning_rate": 0.0001066820276497696,
"loss": 0.4264,
"step": 810
},
{
"epoch": 1.8894009216589862,
"grad_norm": 7.794037342071533,
"learning_rate": 0.0001055299539170507,
"loss": 0.4513,
"step": 820
},
{
"epoch": 1.912442396313364,
"grad_norm": 3.597796678543091,
"learning_rate": 0.00010437788018433181,
"loss": 0.437,
"step": 830
},
{
"epoch": 1.935483870967742,
"grad_norm": 2.825336217880249,
"learning_rate": 0.0001032258064516129,
"loss": 0.5202,
"step": 840
},
{
"epoch": 1.9585253456221197,
"grad_norm": 1.8002281188964844,
"learning_rate": 0.00010207373271889402,
"loss": 0.3283,
"step": 850
},
{
"epoch": 1.9815668202764978,
"grad_norm": 6.496976375579834,
"learning_rate": 0.00010092165898617512,
"loss": 0.2887,
"step": 860
},
{
"epoch": 2.0046082949308754,
"grad_norm": 2.1674392223358154,
"learning_rate": 9.976958525345623e-05,
"loss": 0.3299,
"step": 870
},
{
"epoch": 2.0276497695852536,
"grad_norm": 0.475057989358902,
"learning_rate": 9.861751152073733e-05,
"loss": 0.2049,
"step": 880
},
{
"epoch": 2.0506912442396312,
"grad_norm": 2.232353687286377,
"learning_rate": 9.746543778801845e-05,
"loss": 0.2598,
"step": 890
},
{
"epoch": 2.0737327188940093,
"grad_norm": 3.595874309539795,
"learning_rate": 9.631336405529955e-05,
"loss": 0.2389,
"step": 900
},
{
"epoch": 2.0737327188940093,
"eval_accuracy": 0.807743658210948,
"eval_loss": 0.5928804278373718,
"eval_runtime": 11.7831,
"eval_samples_per_second": 127.131,
"eval_steps_per_second": 15.955,
"step": 900
},
{
"epoch": 2.096774193548387,
"grad_norm": 2.4027860164642334,
"learning_rate": 9.516129032258065e-05,
"loss": 0.2023,
"step": 910
},
{
"epoch": 2.119815668202765,
"grad_norm": 4.1582560539245605,
"learning_rate": 9.400921658986176e-05,
"loss": 0.2389,
"step": 920
},
{
"epoch": 2.142857142857143,
"grad_norm": 3.8105199337005615,
"learning_rate": 9.285714285714286e-05,
"loss": 0.2054,
"step": 930
},
{
"epoch": 2.165898617511521,
"grad_norm": 4.042884826660156,
"learning_rate": 9.170506912442398e-05,
"loss": 0.2445,
"step": 940
},
{
"epoch": 2.1889400921658986,
"grad_norm": 3.3385071754455566,
"learning_rate": 9.055299539170508e-05,
"loss": 0.2578,
"step": 950
},
{
"epoch": 2.2119815668202767,
"grad_norm": 2.232977867126465,
"learning_rate": 8.940092165898618e-05,
"loss": 0.2168,
"step": 960
},
{
"epoch": 2.2350230414746544,
"grad_norm": 4.8774847984313965,
"learning_rate": 8.824884792626729e-05,
"loss": 0.1978,
"step": 970
},
{
"epoch": 2.258064516129032,
"grad_norm": 2.6131808757781982,
"learning_rate": 8.709677419354839e-05,
"loss": 0.223,
"step": 980
},
{
"epoch": 2.28110599078341,
"grad_norm": 1.6126481294631958,
"learning_rate": 8.594470046082949e-05,
"loss": 0.3882,
"step": 990
},
{
"epoch": 2.3041474654377883,
"grad_norm": 1.6977124214172363,
"learning_rate": 8.479262672811061e-05,
"loss": 0.2767,
"step": 1000
},
{
"epoch": 2.3041474654377883,
"eval_accuracy": 0.8090787716955942,
"eval_loss": 0.5795237421989441,
"eval_runtime": 11.3869,
"eval_samples_per_second": 131.555,
"eval_steps_per_second": 16.51,
"step": 1000
},
{
"epoch": 2.327188940092166,
"grad_norm": 5.384529113769531,
"learning_rate": 8.364055299539171e-05,
"loss": 0.2478,
"step": 1010
},
{
"epoch": 2.3502304147465436,
"grad_norm": 7.527071952819824,
"learning_rate": 8.248847926267282e-05,
"loss": 0.1614,
"step": 1020
},
{
"epoch": 2.3732718894009217,
"grad_norm": 3.253967523574829,
"learning_rate": 8.133640552995392e-05,
"loss": 0.1988,
"step": 1030
},
{
"epoch": 2.3963133640552994,
"grad_norm": 2.3061683177948,
"learning_rate": 8.018433179723502e-05,
"loss": 0.2267,
"step": 1040
},
{
"epoch": 2.4193548387096775,
"grad_norm": 5.240030288696289,
"learning_rate": 7.903225806451613e-05,
"loss": 0.3522,
"step": 1050
},
{
"epoch": 2.442396313364055,
"grad_norm": 5.367170810699463,
"learning_rate": 7.788018433179723e-05,
"loss": 0.21,
"step": 1060
},
{
"epoch": 2.4654377880184333,
"grad_norm": 2.52602219581604,
"learning_rate": 7.672811059907835e-05,
"loss": 0.208,
"step": 1070
},
{
"epoch": 2.488479262672811,
"grad_norm": 3.110276937484741,
"learning_rate": 7.557603686635945e-05,
"loss": 0.1624,
"step": 1080
},
{
"epoch": 2.511520737327189,
"grad_norm": 3.7577178478240967,
"learning_rate": 7.442396313364057e-05,
"loss": 0.2187,
"step": 1090
},
{
"epoch": 2.5345622119815667,
"grad_norm": 0.886064887046814,
"learning_rate": 7.327188940092167e-05,
"loss": 0.2387,
"step": 1100
},
{
"epoch": 2.5345622119815667,
"eval_accuracy": 0.8090787716955942,
"eval_loss": 0.6099982857704163,
"eval_runtime": 11.7513,
"eval_samples_per_second": 127.476,
"eval_steps_per_second": 15.998,
"step": 1100
},
{
"epoch": 2.557603686635945,
"grad_norm": 0.9772585034370422,
"learning_rate": 7.211981566820277e-05,
"loss": 0.2289,
"step": 1110
},
{
"epoch": 2.5806451612903225,
"grad_norm": 5.879600524902344,
"learning_rate": 7.096774193548388e-05,
"loss": 0.2592,
"step": 1120
},
{
"epoch": 2.6036866359447006,
"grad_norm": 5.125580310821533,
"learning_rate": 6.981566820276498e-05,
"loss": 0.1801,
"step": 1130
},
{
"epoch": 2.6267281105990783,
"grad_norm": 4.4502692222595215,
"learning_rate": 6.86635944700461e-05,
"loss": 0.3577,
"step": 1140
},
{
"epoch": 2.6497695852534564,
"grad_norm": 0.543267548084259,
"learning_rate": 6.75115207373272e-05,
"loss": 0.1313,
"step": 1150
},
{
"epoch": 2.672811059907834,
"grad_norm": 1.4891630411148071,
"learning_rate": 6.63594470046083e-05,
"loss": 0.1858,
"step": 1160
},
{
"epoch": 2.6958525345622117,
"grad_norm": 2.359645366668701,
"learning_rate": 6.52073732718894e-05,
"loss": 0.2059,
"step": 1170
},
{
"epoch": 2.71889400921659,
"grad_norm": 2.5760185718536377,
"learning_rate": 6.405529953917051e-05,
"loss": 0.2378,
"step": 1180
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.24703356623649597,
"learning_rate": 6.290322580645161e-05,
"loss": 0.1487,
"step": 1190
},
{
"epoch": 2.7649769585253456,
"grad_norm": 0.22307877242565155,
"learning_rate": 6.175115207373272e-05,
"loss": 0.1691,
"step": 1200
},
{
"epoch": 2.7649769585253456,
"eval_accuracy": 0.8070761014686249,
"eval_loss": 0.6174820065498352,
"eval_runtime": 11.265,
"eval_samples_per_second": 132.978,
"eval_steps_per_second": 16.689,
"step": 1200
},
{
"epoch": 2.7880184331797233,
"grad_norm": 2.50034761428833,
"learning_rate": 6.0599078341013825e-05,
"loss": 0.2148,
"step": 1210
},
{
"epoch": 2.8110599078341014,
"grad_norm": 0.3251860439777374,
"learning_rate": 5.944700460829493e-05,
"loss": 0.1538,
"step": 1220
},
{
"epoch": 2.8341013824884795,
"grad_norm": 3.687969446182251,
"learning_rate": 5.829493087557604e-05,
"loss": 0.2445,
"step": 1230
},
{
"epoch": 2.857142857142857,
"grad_norm": 7.214417457580566,
"learning_rate": 5.714285714285714e-05,
"loss": 0.229,
"step": 1240
},
{
"epoch": 2.880184331797235,
"grad_norm": 2.587062120437622,
"learning_rate": 5.5990783410138245e-05,
"loss": 0.1999,
"step": 1250
},
{
"epoch": 2.903225806451613,
"grad_norm": 4.365920066833496,
"learning_rate": 5.4838709677419355e-05,
"loss": 0.1061,
"step": 1260
},
{
"epoch": 2.9262672811059907,
"grad_norm": 3.7295572757720947,
"learning_rate": 5.368663594470046e-05,
"loss": 0.3093,
"step": 1270
},
{
"epoch": 2.9493087557603688,
"grad_norm": 2.4992685317993164,
"learning_rate": 5.253456221198156e-05,
"loss": 0.1644,
"step": 1280
},
{
"epoch": 2.9723502304147464,
"grad_norm": 5.495995998382568,
"learning_rate": 5.138248847926268e-05,
"loss": 0.2393,
"step": 1290
},
{
"epoch": 2.9953917050691246,
"grad_norm": 2.1380579471588135,
"learning_rate": 5.023041474654379e-05,
"loss": 0.1738,
"step": 1300
},
{
"epoch": 2.9953917050691246,
"eval_accuracy": 0.8197596795727636,
"eval_loss": 0.5877332091331482,
"eval_runtime": 11.4089,
"eval_samples_per_second": 131.301,
"eval_steps_per_second": 16.478,
"step": 1300
},
{
"epoch": 3.0184331797235022,
"grad_norm": 6.119831085205078,
"learning_rate": 4.9078341013824885e-05,
"loss": 0.075,
"step": 1310
},
{
"epoch": 3.0414746543778803,
"grad_norm": 0.25446683168411255,
"learning_rate": 4.792626728110599e-05,
"loss": 0.0528,
"step": 1320
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.32773900032043457,
"learning_rate": 4.67741935483871e-05,
"loss": 0.0551,
"step": 1330
},
{
"epoch": 3.087557603686636,
"grad_norm": 0.8912816643714905,
"learning_rate": 4.562211981566821e-05,
"loss": 0.0799,
"step": 1340
},
{
"epoch": 3.110599078341014,
"grad_norm": 0.6732431054115295,
"learning_rate": 4.447004608294931e-05,
"loss": 0.0327,
"step": 1350
},
{
"epoch": 3.133640552995392,
"grad_norm": 5.909882545471191,
"learning_rate": 4.3317972350230415e-05,
"loss": 0.108,
"step": 1360
},
{
"epoch": 3.1566820276497696,
"grad_norm": 1.3546661138534546,
"learning_rate": 4.2165898617511525e-05,
"loss": 0.1057,
"step": 1370
},
{
"epoch": 3.1797235023041477,
"grad_norm": 0.09205944836139679,
"learning_rate": 4.101382488479263e-05,
"loss": 0.045,
"step": 1380
},
{
"epoch": 3.2027649769585254,
"grad_norm": 0.12445586174726486,
"learning_rate": 3.986175115207373e-05,
"loss": 0.0391,
"step": 1390
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.49267128109931946,
"learning_rate": 3.870967741935484e-05,
"loss": 0.0397,
"step": 1400
},
{
"epoch": 3.225806451612903,
"eval_accuracy": 0.835781041388518,
"eval_loss": 0.576629102230072,
"eval_runtime": 11.5658,
"eval_samples_per_second": 129.52,
"eval_steps_per_second": 16.255,
"step": 1400
},
{
"epoch": 3.248847926267281,
"grad_norm": 0.24710910022258759,
"learning_rate": 3.7557603686635945e-05,
"loss": 0.0982,
"step": 1410
},
{
"epoch": 3.271889400921659,
"grad_norm": 1.3541345596313477,
"learning_rate": 3.640552995391705e-05,
"loss": 0.1062,
"step": 1420
},
{
"epoch": 3.294930875576037,
"grad_norm": 0.07805185765028,
"learning_rate": 3.525345622119816e-05,
"loss": 0.0367,
"step": 1430
},
{
"epoch": 3.3179723502304146,
"grad_norm": 0.704824686050415,
"learning_rate": 3.410138248847927e-05,
"loss": 0.0576,
"step": 1440
},
{
"epoch": 3.3410138248847927,
"grad_norm": 3.216744899749756,
"learning_rate": 3.294930875576037e-05,
"loss": 0.123,
"step": 1450
},
{
"epoch": 3.3640552995391704,
"grad_norm": 3.2812583446502686,
"learning_rate": 3.1797235023041475e-05,
"loss": 0.0535,
"step": 1460
},
{
"epoch": 3.3870967741935485,
"grad_norm": 0.09345371276140213,
"learning_rate": 3.0645161290322585e-05,
"loss": 0.0363,
"step": 1470
},
{
"epoch": 3.410138248847926,
"grad_norm": 0.5610162019729614,
"learning_rate": 2.9493087557603688e-05,
"loss": 0.0903,
"step": 1480
},
{
"epoch": 3.4331797235023043,
"grad_norm": 1.413180947303772,
"learning_rate": 2.8341013824884795e-05,
"loss": 0.0792,
"step": 1490
},
{
"epoch": 3.456221198156682,
"grad_norm": 6.735473155975342,
"learning_rate": 2.7188940092165898e-05,
"loss": 0.03,
"step": 1500
},
{
"epoch": 3.456221198156682,
"eval_accuracy": 0.8371161548731643,
"eval_loss": 0.5680701732635498,
"eval_runtime": 11.6369,
"eval_samples_per_second": 128.728,
"eval_steps_per_second": 16.155,
"step": 1500
},
{
"epoch": 3.47926267281106,
"grad_norm": 1.4329415559768677,
"learning_rate": 2.6036866359447005e-05,
"loss": 0.0206,
"step": 1510
},
{
"epoch": 3.5023041474654377,
"grad_norm": 0.0513407364487648,
"learning_rate": 2.488479262672811e-05,
"loss": 0.0637,
"step": 1520
},
{
"epoch": 3.525345622119816,
"grad_norm": 0.09985367208719254,
"learning_rate": 2.3732718894009218e-05,
"loss": 0.0829,
"step": 1530
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.0632900595664978,
"learning_rate": 2.258064516129032e-05,
"loss": 0.0329,
"step": 1540
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.23229588568210602,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.0709,
"step": 1550
},
{
"epoch": 3.5944700460829493,
"grad_norm": 0.15025608241558075,
"learning_rate": 2.0276497695852538e-05,
"loss": 0.1135,
"step": 1560
},
{
"epoch": 3.6175115207373274,
"grad_norm": 5.933778285980225,
"learning_rate": 1.912442396313364e-05,
"loss": 0.1093,
"step": 1570
},
{
"epoch": 3.640552995391705,
"grad_norm": 0.06949874013662338,
"learning_rate": 1.7972350230414748e-05,
"loss": 0.0498,
"step": 1580
},
{
"epoch": 3.6635944700460827,
"grad_norm": 0.09838402271270752,
"learning_rate": 1.682027649769585e-05,
"loss": 0.0598,
"step": 1590
},
{
"epoch": 3.686635944700461,
"grad_norm": 0.9366612434387207,
"learning_rate": 1.5668202764976958e-05,
"loss": 0.092,
"step": 1600
},
{
"epoch": 3.686635944700461,
"eval_accuracy": 0.8451268357810414,
"eval_loss": 0.5304816365242004,
"eval_runtime": 11.6024,
"eval_samples_per_second": 129.111,
"eval_steps_per_second": 16.203,
"step": 1600
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.04733530059456825,
"learning_rate": 1.4516129032258066e-05,
"loss": 0.0276,
"step": 1610
},
{
"epoch": 3.7327188940092166,
"grad_norm": 0.08603022992610931,
"learning_rate": 1.3364055299539171e-05,
"loss": 0.0347,
"step": 1620
},
{
"epoch": 3.7557603686635943,
"grad_norm": 0.041543856263160706,
"learning_rate": 1.2211981566820276e-05,
"loss": 0.026,
"step": 1630
},
{
"epoch": 3.7788018433179724,
"grad_norm": 0.24026305973529816,
"learning_rate": 1.1059907834101383e-05,
"loss": 0.0496,
"step": 1640
},
{
"epoch": 3.80184331797235,
"grad_norm": 0.03894612938165665,
"learning_rate": 9.90783410138249e-06,
"loss": 0.0365,
"step": 1650
},
{
"epoch": 3.824884792626728,
"grad_norm": 4.442405700683594,
"learning_rate": 8.755760368663595e-06,
"loss": 0.0402,
"step": 1660
},
{
"epoch": 3.847926267281106,
"grad_norm": 0.032657474279403687,
"learning_rate": 7.603686635944701e-06,
"loss": 0.0596,
"step": 1670
},
{
"epoch": 3.870967741935484,
"grad_norm": 2.9635491371154785,
"learning_rate": 6.451612903225806e-06,
"loss": 0.0835,
"step": 1680
},
{
"epoch": 3.8940092165898617,
"grad_norm": 0.06601913273334503,
"learning_rate": 5.299539170506913e-06,
"loss": 0.0277,
"step": 1690
},
{
"epoch": 3.9170506912442398,
"grad_norm": 0.22990980744361877,
"learning_rate": 4.147465437788019e-06,
"loss": 0.0416,
"step": 1700
},
{
"epoch": 3.9170506912442398,
"eval_accuracy": 0.8471295060080107,
"eval_loss": 0.5442608594894409,
"eval_runtime": 11.486,
"eval_samples_per_second": 130.42,
"eval_steps_per_second": 16.368,
"step": 1700
},
{
"epoch": 3.9400921658986174,
"grad_norm": 0.06300857663154602,
"learning_rate": 2.9953917050691243e-06,
"loss": 0.0331,
"step": 1710
},
{
"epoch": 3.9631336405529956,
"grad_norm": 1.9846687316894531,
"learning_rate": 1.8433179723502305e-06,
"loss": 0.04,
"step": 1720
},
{
"epoch": 3.986175115207373,
"grad_norm": 0.4808693826198578,
"learning_rate": 6.912442396313364e-07,
"loss": 0.0494,
"step": 1730
},
{
"epoch": 4.0,
"step": 1736,
"total_flos": 2.1525139607212524e+18,
"train_loss": 0.4232822818690181,
"train_runtime": 559.5799,
"train_samples_per_second": 49.637,
"train_steps_per_second": 3.102
}
],
"logging_steps": 10,
"max_steps": 1736,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1525139607212524e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}