Desm0nt's picture
Upload trainer_state.json (#1)
181301a verified
raw
history blame
54.3 kB
{
"best_metric": 1.52509904,
"best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216\\checkpoint-300",
"epoch": 3.5225048923679063,
"eval_steps": 50,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"acc": 0.49833804,
"epoch": 0.003913894324853229,
"grad_norm": 0.77734375,
"learning_rate": 2.745098039215686e-06,
"loss": 2.37747383,
"memory(GiB)": 17.35,
"step": 1,
"train_speed(iter/s)": 0.076826
},
{
"acc": 0.50652587,
"epoch": 0.019569471624266144,
"grad_norm": 1.140625,
"learning_rate": 1.372549019607843e-05,
"loss": 2.29183841,
"memory(GiB)": 19.33,
"step": 5,
"train_speed(iter/s)": 0.082188
},
{
"acc": 0.52587533,
"epoch": 0.03913894324853229,
"grad_norm": 0.68359375,
"learning_rate": 2.745098039215686e-05,
"loss": 2.22724895,
"memory(GiB)": 19.89,
"step": 10,
"train_speed(iter/s)": 0.082805
},
{
"acc": 0.52128973,
"epoch": 0.05870841487279843,
"grad_norm": 0.8359375,
"learning_rate": 4.117647058823529e-05,
"loss": 2.27491264,
"memory(GiB)": 19.24,
"step": 15,
"train_speed(iter/s)": 0.082482
},
{
"acc": 0.51135335,
"epoch": 0.07827788649706457,
"grad_norm": 0.66015625,
"learning_rate": 5.490196078431372e-05,
"loss": 2.32762127,
"memory(GiB)": 19.86,
"step": 20,
"train_speed(iter/s)": 0.082557
},
{
"acc": 0.54442377,
"epoch": 0.09784735812133072,
"grad_norm": 0.65625,
"learning_rate": 6.862745098039214e-05,
"loss": 2.09772224,
"memory(GiB)": 19.05,
"step": 25,
"train_speed(iter/s)": 0.082348
},
{
"acc": 0.5545311,
"epoch": 0.11741682974559686,
"grad_norm": 0.62109375,
"learning_rate": 8.235294117647058e-05,
"loss": 2.00072975,
"memory(GiB)": 19.89,
"step": 30,
"train_speed(iter/s)": 0.082166
},
{
"acc": 0.57092514,
"epoch": 0.136986301369863,
"grad_norm": 0.9296875,
"learning_rate": 9.6078431372549e-05,
"loss": 1.94450474,
"memory(GiB)": 19.16,
"step": 35,
"train_speed(iter/s)": 0.081966
},
{
"acc": 0.56716595,
"epoch": 0.15655577299412915,
"grad_norm": 0.7734375,
"learning_rate": 0.00010980392156862745,
"loss": 1.90242462,
"memory(GiB)": 19.62,
"step": 40,
"train_speed(iter/s)": 0.081987
},
{
"acc": 0.57822714,
"epoch": 0.1761252446183953,
"grad_norm": 0.74609375,
"learning_rate": 0.00012352941176470587,
"loss": 1.83147659,
"memory(GiB)": 19.99,
"step": 45,
"train_speed(iter/s)": 0.081878
},
{
"acc": 0.57696896,
"epoch": 0.19569471624266144,
"grad_norm": 0.85546875,
"learning_rate": 0.00013725490196078428,
"loss": 1.82299595,
"memory(GiB)": 19.11,
"step": 50,
"train_speed(iter/s)": 0.081843
},
{
"epoch": 0.19569471624266144,
"eval_acc": 0.583503534956795,
"eval_loss": 1.8029242753982544,
"eval_runtime": 85.1254,
"eval_samples_per_second": 0.893,
"eval_steps_per_second": 0.446,
"step": 50
},
{
"acc": 0.59343066,
"epoch": 0.21526418786692758,
"grad_norm": 1.0,
"learning_rate": 0.0001399941138119636,
"loss": 1.82339039,
"memory(GiB)": 22.92,
"step": 55,
"train_speed(iter/s)": 0.072544
},
{
"acc": 0.58571839,
"epoch": 0.23483365949119372,
"grad_norm": 0.7734375,
"learning_rate": 0.00013997020286964757,
"loss": 1.80549526,
"memory(GiB)": 19.43,
"step": 60,
"train_speed(iter/s)": 0.073269
},
{
"acc": 0.60369935,
"epoch": 0.25440313111545987,
"grad_norm": 0.99609375,
"learning_rate": 0.0001399279055646442,
"loss": 1.6768074,
"memory(GiB)": 19.57,
"step": 65,
"train_speed(iter/s)": 0.073897
},
{
"acc": 0.58763909,
"epoch": 0.273972602739726,
"grad_norm": 1.1640625,
"learning_rate": 0.00013986723301159307,
"loss": 1.79169483,
"memory(GiB)": 19.48,
"step": 70,
"train_speed(iter/s)": 0.074533
},
{
"acc": 0.58979025,
"epoch": 0.29354207436399216,
"grad_norm": 0.69140625,
"learning_rate": 0.00013978820115367462,
"loss": 1.72388344,
"memory(GiB)": 19.35,
"step": 75,
"train_speed(iter/s)": 0.075045
},
{
"acc": 0.59725327,
"epoch": 0.3131115459882583,
"grad_norm": 0.75,
"learning_rate": 0.00013969083075842048,
"loss": 1.70864868,
"memory(GiB)": 19.49,
"step": 80,
"train_speed(iter/s)": 0.075523
},
{
"acc": 0.60098982,
"epoch": 0.33268101761252444,
"grad_norm": 4.59375,
"learning_rate": 0.00013957514741225646,
"loss": 1.67311764,
"memory(GiB)": 20.01,
"step": 85,
"train_speed(iter/s)": 0.075928
},
{
"acc": 0.58315139,
"epoch": 0.3522504892367906,
"grad_norm": 0.8359375,
"learning_rate": 0.00013944118151377894,
"loss": 1.74437752,
"memory(GiB)": 20.14,
"step": 90,
"train_speed(iter/s)": 0.076154
},
{
"acc": 0.6138227,
"epoch": 0.37181996086105673,
"grad_norm": 0.75,
"learning_rate": 0.0001392889682657671,
"loss": 1.63750076,
"memory(GiB)": 19.59,
"step": 95,
"train_speed(iter/s)": 0.076253
},
{
"acc": 0.63383026,
"epoch": 0.3913894324853229,
"grad_norm": 0.8515625,
"learning_rate": 0.00013911854766593233,
"loss": 1.56653557,
"memory(GiB)": 19.5,
"step": 100,
"train_speed(iter/s)": 0.076386
},
{
"epoch": 0.3913894324853229,
"eval_acc": 0.604241948153967,
"eval_loss": 1.6681365966796875,
"eval_runtime": 72.2811,
"eval_samples_per_second": 1.051,
"eval_steps_per_second": 0.526,
"step": 100
},
{
"acc": 0.61646304,
"epoch": 0.410958904109589,
"grad_norm": 0.73046875,
"learning_rate": 0.00013892996449640807,
"loss": 1.59651537,
"memory(GiB)": 22.5,
"step": 105,
"train_speed(iter/s)": 0.072857
},
{
"acc": 0.60897431,
"epoch": 0.43052837573385516,
"grad_norm": 0.83984375,
"learning_rate": 0.00013872326831198205,
"loss": 1.70257473,
"memory(GiB)": 19.42,
"step": 110,
"train_speed(iter/s)": 0.073309
},
{
"acc": 0.58328586,
"epoch": 0.4500978473581213,
"grad_norm": 0.9453125,
"learning_rate": 0.00013849851342707462,
"loss": 1.71216717,
"memory(GiB)": 19.47,
"step": 115,
"train_speed(iter/s)": 0.073753
},
{
"acc": 0.62397904,
"epoch": 0.46966731898238745,
"grad_norm": 0.80078125,
"learning_rate": 0.0001382557589014664,
"loss": 1.54239073,
"memory(GiB)": 19.33,
"step": 120,
"train_speed(iter/s)": 0.074078
},
{
"acc": 0.60271235,
"epoch": 0.4892367906066536,
"grad_norm": 1.171875,
"learning_rate": 0.0001379950685247788,
"loss": 1.72333088,
"memory(GiB)": 19.37,
"step": 125,
"train_speed(iter/s)": 0.074428
},
{
"acc": 0.5755064,
"epoch": 0.5088062622309197,
"grad_norm": 0.94921875,
"learning_rate": 0.00013771651079971182,
"loss": 1.81728477,
"memory(GiB)": 19.52,
"step": 130,
"train_speed(iter/s)": 0.074768
},
{
"acc": 0.5844254,
"epoch": 0.5283757338551859,
"grad_norm": 0.8515625,
"learning_rate": 0.00013742015892404325,
"loss": 1.77252998,
"memory(GiB)": 19.51,
"step": 135,
"train_speed(iter/s)": 0.075066
},
{
"acc": 0.5998323,
"epoch": 0.547945205479452,
"grad_norm": 0.8671875,
"learning_rate": 0.0001371060907713942,
"loss": 1.69012871,
"memory(GiB)": 19.54,
"step": 140,
"train_speed(iter/s)": 0.07528
},
{
"acc": 0.62686119,
"epoch": 0.5675146771037182,
"grad_norm": 0.68359375,
"learning_rate": 0.00013677438887076603,
"loss": 1.66314449,
"memory(GiB)": 19.54,
"step": 145,
"train_speed(iter/s)": 0.075467
},
{
"acc": 0.59954901,
"epoch": 0.5870841487279843,
"grad_norm": 0.6328125,
"learning_rate": 0.00013642514038485367,
"loss": 1.67525444,
"memory(GiB)": 19.55,
"step": 150,
"train_speed(iter/s)": 0.075722
},
{
"epoch": 0.5870841487279843,
"eval_acc": 0.6184603299293009,
"eval_loss": 1.5965631008148193,
"eval_runtime": 72.3005,
"eval_samples_per_second": 1.051,
"eval_steps_per_second": 0.526,
"step": 150
},
{
"acc": 0.585955,
"epoch": 0.6066536203522505,
"grad_norm": 0.9375,
"learning_rate": 0.00013605843708714162,
"loss": 1.7486639,
"memory(GiB)": 23.22,
"step": 155,
"train_speed(iter/s)": 0.073368
},
{
"acc": 0.62769904,
"epoch": 0.6262230919765166,
"grad_norm": 0.7265625,
"learning_rate": 0.00013567437533778826,
"loss": 1.55238762,
"memory(GiB)": 19.62,
"step": 160,
"train_speed(iter/s)": 0.073628
},
{
"acc": 0.63651643,
"epoch": 0.6457925636007827,
"grad_norm": 0.80078125,
"learning_rate": 0.00013527305605830488,
"loss": 1.54306393,
"memory(GiB)": 19.88,
"step": 165,
"train_speed(iter/s)": 0.073903
},
{
"acc": 0.59288979,
"epoch": 0.6653620352250489,
"grad_norm": 0.703125,
"learning_rate": 0.0001348545847050361,
"loss": 1.69727612,
"memory(GiB)": 19.58,
"step": 170,
"train_speed(iter/s)": 0.074077
},
{
"acc": 0.61248484,
"epoch": 0.684931506849315,
"grad_norm": 0.9140625,
"learning_rate": 0.00013441907124144866,
"loss": 1.65900764,
"memory(GiB)": 19.49,
"step": 175,
"train_speed(iter/s)": 0.074329
},
{
"acc": 0.61740661,
"epoch": 0.7045009784735812,
"grad_norm": 0.90625,
"learning_rate": 0.0001339666301092358,
"loss": 1.6518961,
"memory(GiB)": 19.68,
"step": 180,
"train_speed(iter/s)": 0.074558
},
{
"acc": 0.62250223,
"epoch": 0.7240704500978473,
"grad_norm": 0.84765625,
"learning_rate": 0.00013349738019824512,
"loss": 1.55100412,
"memory(GiB)": 19.34,
"step": 185,
"train_speed(iter/s)": 0.07477
},
{
"acc": 0.61055808,
"epoch": 0.7436399217221135,
"grad_norm": 0.90625,
"learning_rate": 0.00013301144481523718,
"loss": 1.67241592,
"memory(GiB)": 19.56,
"step": 190,
"train_speed(iter/s)": 0.075006
},
{
"acc": 0.6389596,
"epoch": 0.7632093933463796,
"grad_norm": 0.83203125,
"learning_rate": 0.00013250895165148384,
"loss": 1.54227753,
"memory(GiB)": 19.29,
"step": 195,
"train_speed(iter/s)": 0.075192
},
{
"acc": 0.59149747,
"epoch": 0.7827788649706457,
"grad_norm": 0.68359375,
"learning_rate": 0.00013199003274921416,
"loss": 1.71190453,
"memory(GiB)": 19.35,
"step": 200,
"train_speed(iter/s)": 0.075393
},
{
"epoch": 0.7827788649706457,
"eval_acc": 0.6241162608012569,
"eval_loss": 1.5573129653930664,
"eval_runtime": 69.5471,
"eval_samples_per_second": 1.093,
"eval_steps_per_second": 0.546,
"step": 200
},
{
"acc": 0.62623324,
"epoch": 0.8023483365949119,
"grad_norm": 0.81640625,
"learning_rate": 0.00013145482446691724,
"loss": 1.55779324,
"memory(GiB)": 20.56,
"step": 205,
"train_speed(iter/s)": 0.073671
},
{
"acc": 0.61495056,
"epoch": 0.821917808219178,
"grad_norm": 1.03125,
"learning_rate": 0.00013090346744351058,
"loss": 1.56424398,
"memory(GiB)": 19.48,
"step": 210,
"train_speed(iter/s)": 0.073902
},
{
"acc": 0.59643593,
"epoch": 0.8414872798434442,
"grad_norm": 1.0703125,
"learning_rate": 0.00013033610656138395,
"loss": 1.62190418,
"memory(GiB)": 19.5,
"step": 215,
"train_speed(iter/s)": 0.074133
},
{
"acc": 0.63052382,
"epoch": 0.8610567514677103,
"grad_norm": 0.59765625,
"learning_rate": 0.00012975289090832792,
"loss": 1.53521852,
"memory(GiB)": 19.53,
"step": 220,
"train_speed(iter/s)": 0.074334
},
{
"acc": 0.61408448,
"epoch": 0.8806262230919765,
"grad_norm": 0.7734375,
"learning_rate": 0.00012915397373835754,
"loss": 1.59712257,
"memory(GiB)": 19.52,
"step": 225,
"train_speed(iter/s)": 0.074533
},
{
"acc": 0.62307076,
"epoch": 0.9001956947162426,
"grad_norm": 0.66796875,
"learning_rate": 0.00012853951243144105,
"loss": 1.57903328,
"memory(GiB)": 19.49,
"step": 230,
"train_speed(iter/s)": 0.074719
},
{
"acc": 0.61717134,
"epoch": 0.9197651663405088,
"grad_norm": 0.84375,
"learning_rate": 0.00012790966845214457,
"loss": 1.61422024,
"memory(GiB)": 19.25,
"step": 235,
"train_speed(iter/s)": 0.074916
},
{
"acc": 0.62549253,
"epoch": 0.9393346379647749,
"grad_norm": 0.8125,
"learning_rate": 0.0001272646073072033,
"loss": 1.62806015,
"memory(GiB)": 19.36,
"step": 240,
"train_speed(iter/s)": 0.0751
},
{
"acc": 0.61903515,
"epoch": 0.958904109589041,
"grad_norm": 0.74609375,
"learning_rate": 0.0001266044985020307,
"loss": 1.55927486,
"memory(GiB)": 19.36,
"step": 245,
"train_speed(iter/s)": 0.075266
},
{
"acc": 0.61238952,
"epoch": 0.9784735812133072,
"grad_norm": 0.87890625,
"learning_rate": 0.00012592951549617683,
"loss": 1.52888412,
"memory(GiB)": 19.33,
"step": 250,
"train_speed(iter/s)": 0.075438
},
{
"epoch": 0.9784735812133072,
"eval_acc": 0.6267085624509033,
"eval_loss": 1.5281730890274048,
"eval_runtime": 69.069,
"eval_samples_per_second": 1.1,
"eval_steps_per_second": 0.55,
"step": 250
},
{
"acc": 0.63230977,
"epoch": 0.9980430528375733,
"grad_norm": 0.84765625,
"learning_rate": 0.00012523983565774753,
"loss": 1.53058205,
"memory(GiB)": 19.46,
"step": 255,
"train_speed(iter/s)": 0.074081
},
{
"acc": 0.66042156,
"epoch": 1.0176125244618395,
"grad_norm": 0.76171875,
"learning_rate": 0.00012453564021679692,
"loss": 1.37123928,
"memory(GiB)": 20.18,
"step": 260,
"train_speed(iter/s)": 0.074295
},
{
"acc": 0.67253222,
"epoch": 1.0371819960861057,
"grad_norm": 0.76953125,
"learning_rate": 0.00012381711421770455,
"loss": 1.28407507,
"memory(GiB)": 19.7,
"step": 265,
"train_speed(iter/s)": 0.074448
},
{
"acc": 0.66850777,
"epoch": 1.0567514677103718,
"grad_norm": 0.98046875,
"learning_rate": 0.0001230844464705507,
"loss": 1.27961807,
"memory(GiB)": 19.58,
"step": 270,
"train_speed(iter/s)": 0.07459
},
{
"acc": 0.67196817,
"epoch": 1.076320939334638,
"grad_norm": 0.9140625,
"learning_rate": 0.00012233782950150186,
"loss": 1.28494987,
"memory(GiB)": 19.61,
"step": 275,
"train_speed(iter/s)": 0.074728
},
{
"acc": 0.67708378,
"epoch": 1.095890410958904,
"grad_norm": 0.87109375,
"learning_rate": 0.00012157745950221989,
"loss": 1.29551096,
"memory(GiB)": 19.63,
"step": 280,
"train_speed(iter/s)": 0.074881
},
{
"acc": 0.66973438,
"epoch": 1.1154598825831703,
"grad_norm": 1.0859375,
"learning_rate": 0.0001208035362783079,
"loss": 1.27705774,
"memory(GiB)": 19.49,
"step": 285,
"train_speed(iter/s)": 0.075029
},
{
"acc": 0.6750237,
"epoch": 1.1350293542074363,
"grad_norm": 1.0859375,
"learning_rate": 0.00012001626319680648,
"loss": 1.25660419,
"memory(GiB)": 19.55,
"step": 290,
"train_speed(iter/s)": 0.07515
},
{
"acc": 0.624368,
"epoch": 1.1545988258317026,
"grad_norm": 1.1953125,
"learning_rate": 0.00011921584713275411,
"loss": 1.5070508,
"memory(GiB)": 19.52,
"step": 295,
"train_speed(iter/s)": 0.075278
},
{
"acc": 0.66252189,
"epoch": 1.1741682974559686,
"grad_norm": 0.828125,
"learning_rate": 0.0001184024984148257,
"loss": 1.32014723,
"memory(GiB)": 19.92,
"step": 300,
"train_speed(iter/s)": 0.075433
},
{
"epoch": 1.1741682974559686,
"eval_acc": 0.6282796543597801,
"eval_loss": 1.5250990390777588,
"eval_runtime": 70.3986,
"eval_samples_per_second": 1.08,
"eval_steps_per_second": 0.54,
"step": 300
},
{
"acc": 0.67028356,
"epoch": 1.1937377690802349,
"grad_norm": 1.7109375,
"learning_rate": 0.00011757643077006372,
"loss": 1.28037386,
"memory(GiB)": 22.6,
"step": 305,
"train_speed(iter/s)": 0.074243
},
{
"acc": 0.655305,
"epoch": 1.213307240704501,
"grad_norm": 1.1015625,
"learning_rate": 0.00011673786126771617,
"loss": 1.31057158,
"memory(GiB)": 19.72,
"step": 310,
"train_speed(iter/s)": 0.074392
},
{
"acc": 0.66528535,
"epoch": 1.2328767123287672,
"grad_norm": 1.6171875,
"learning_rate": 0.0001158870102621965,
"loss": 1.29698696,
"memory(GiB)": 19.08,
"step": 315,
"train_speed(iter/s)": 0.074534
},
{
"acc": 0.66950455,
"epoch": 1.2524461839530332,
"grad_norm": 1.2421875,
"learning_rate": 0.00011502410133517998,
"loss": 1.27706356,
"memory(GiB)": 19.87,
"step": 320,
"train_speed(iter/s)": 0.074667
},
{
"acc": 0.65843534,
"epoch": 1.2720156555772995,
"grad_norm": 1.2265625,
"learning_rate": 0.0001141493612368524,
"loss": 1.30308371,
"memory(GiB)": 19.87,
"step": 325,
"train_speed(iter/s)": 0.0748
},
{
"acc": 0.66441913,
"epoch": 1.2915851272015655,
"grad_norm": 1.2578125,
"learning_rate": 0.00011326301982632583,
"loss": 1.26109972,
"memory(GiB)": 19.09,
"step": 330,
"train_speed(iter/s)": 0.074935
},
{
"acc": 0.68711085,
"epoch": 1.3111545988258317,
"grad_norm": 0.95703125,
"learning_rate": 0.00011236531001123771,
"loss": 1.19278584,
"memory(GiB)": 19.73,
"step": 335,
"train_speed(iter/s)": 0.075053
},
{
"acc": 0.66676803,
"epoch": 1.3307240704500978,
"grad_norm": 1.96875,
"learning_rate": 0.0001114564676865486,
"loss": 1.3068346,
"memory(GiB)": 19.84,
"step": 340,
"train_speed(iter/s)": 0.075151
},
{
"acc": 0.66865935,
"epoch": 1.350293542074364,
"grad_norm": 1.2421875,
"learning_rate": 0.00011053673167255516,
"loss": 1.30573978,
"memory(GiB)": 19.66,
"step": 345,
"train_speed(iter/s)": 0.075271
},
{
"acc": 0.66606102,
"epoch": 1.36986301369863,
"grad_norm": 0.76171875,
"learning_rate": 0.00010960634365213437,
"loss": 1.26872787,
"memory(GiB)": 19.73,
"step": 350,
"train_speed(iter/s)": 0.075377
},
{
"epoch": 1.36986301369863,
"eval_acc": 0.6315003927729772,
"eval_loss": 1.5066882371902466,
"eval_runtime": 72.5685,
"eval_samples_per_second": 1.047,
"eval_steps_per_second": 0.524,
"step": 350
},
{
"acc": 0.67307239,
"epoch": 1.3894324853228963,
"grad_norm": 1.1796875,
"learning_rate": 0.0001086655481072354,
"loss": 1.27917318,
"memory(GiB)": 22.92,
"step": 355,
"train_speed(iter/s)": 0.074318
},
{
"acc": 0.65870218,
"epoch": 1.4090019569471623,
"grad_norm": 3.609375,
"learning_rate": 0.00010771459225463617,
"loss": 1.33731461,
"memory(GiB)": 19.67,
"step": 360,
"train_speed(iter/s)": 0.074416
},
{
"acc": 0.68150563,
"epoch": 1.4285714285714286,
"grad_norm": 0.9296875,
"learning_rate": 0.00010675372598098113,
"loss": 1.20515957,
"memory(GiB)": 19.99,
"step": 365,
"train_speed(iter/s)": 0.07451
},
{
"acc": 0.66793504,
"epoch": 1.4481409001956946,
"grad_norm": 1.03125,
"learning_rate": 0.00010578320177711743,
"loss": 1.31133595,
"memory(GiB)": 19.9,
"step": 370,
"train_speed(iter/s)": 0.074613
},
{
"acc": 0.66840873,
"epoch": 1.467710371819961,
"grad_norm": 0.9453125,
"learning_rate": 0.00010480327467174705,
"loss": 1.27730675,
"memory(GiB)": 19.91,
"step": 375,
"train_speed(iter/s)": 0.074709
},
{
"acc": 0.6621439,
"epoch": 1.487279843444227,
"grad_norm": 0.7890625,
"learning_rate": 0.00010381420216441152,
"loss": 1.29670372,
"memory(GiB)": 19.65,
"step": 380,
"train_speed(iter/s)": 0.074824
},
{
"acc": 0.66805882,
"epoch": 1.5068493150684932,
"grad_norm": 0.8203125,
"learning_rate": 0.00010281624415782804,
"loss": 1.23922901,
"memory(GiB)": 19.77,
"step": 385,
"train_speed(iter/s)": 0.074927
},
{
"acc": 0.66435666,
"epoch": 1.5264187866927594,
"grad_norm": 0.82421875,
"learning_rate": 0.0001018096628895935,
"loss": 1.27945633,
"memory(GiB)": 19.79,
"step": 390,
"train_speed(iter/s)": 0.075033
},
{
"acc": 0.68444743,
"epoch": 1.5459882583170255,
"grad_norm": 0.98046875,
"learning_rate": 0.00010079472286327533,
"loss": 1.2325819,
"memory(GiB)": 19.55,
"step": 395,
"train_speed(iter/s)": 0.075133
},
{
"acc": 0.68633671,
"epoch": 1.5655577299412915,
"grad_norm": 1.171875,
"learning_rate": 9.977169077890672e-05,
"loss": 1.26248102,
"memory(GiB)": 19.79,
"step": 400,
"train_speed(iter/s)": 0.075233
},
{
"epoch": 1.5655577299412915,
"eval_acc": 0.6297721916732129,
"eval_loss": 1.5114485025405884,
"eval_runtime": 70.7985,
"eval_samples_per_second": 1.073,
"eval_steps_per_second": 0.537,
"step": 400
},
{
"acc": 0.67859097,
"epoch": 1.5851272015655578,
"grad_norm": 1.046875,
"learning_rate": 9.874083546290482e-05,
"loss": 1.2065486,
"memory(GiB)": 22.72,
"step": 405,
"train_speed(iter/s)": 0.074347
},
{
"acc": 0.66178751,
"epoch": 1.604696673189824,
"grad_norm": 0.96484375,
"learning_rate": 9.770242779743008e-05,
"loss": 1.30969448,
"memory(GiB)": 20.13,
"step": 410,
"train_speed(iter/s)": 0.074453
},
{
"acc": 0.65872512,
"epoch": 1.62426614481409,
"grad_norm": 0.74609375,
"learning_rate": 9.665674064920533e-05,
"loss": 1.27483397,
"memory(GiB)": 20.17,
"step": 415,
"train_speed(iter/s)": 0.074534
},
{
"acc": 0.66567349,
"epoch": 1.643835616438356,
"grad_norm": 0.87109375,
"learning_rate": 9.560404879781353e-05,
"loss": 1.31585007,
"memory(GiB)": 20.07,
"step": 420,
"train_speed(iter/s)": 0.074639
},
{
"acc": 0.66216898,
"epoch": 1.6634050880626223,
"grad_norm": 0.85546875,
"learning_rate": 9.454462886349281e-05,
"loss": 1.32738457,
"memory(GiB)": 19.43,
"step": 425,
"train_speed(iter/s)": 0.074732
},
{
"acc": 0.6608973,
"epoch": 1.6829745596868886,
"grad_norm": 1.1328125,
"learning_rate": 9.347875923444772e-05,
"loss": 1.2792593,
"memory(GiB)": 20.05,
"step": 430,
"train_speed(iter/s)": 0.074827
},
{
"acc": 0.65830297,
"epoch": 1.7025440313111546,
"grad_norm": 0.94921875,
"learning_rate": 9.240671999369607e-05,
"loss": 1.34132614,
"memory(GiB)": 19.82,
"step": 435,
"train_speed(iter/s)": 0.074914
},
{
"acc": 0.68926673,
"epoch": 1.7221135029354206,
"grad_norm": 0.76953125,
"learning_rate": 9.132879284547038e-05,
"loss": 1.15266266,
"memory(GiB)": 19.28,
"step": 440,
"train_speed(iter/s)": 0.074997
},
{
"acc": 0.65699558,
"epoch": 1.741682974559687,
"grad_norm": 0.96484375,
"learning_rate": 9.024526104119312e-05,
"loss": 1.32417459,
"memory(GiB)": 19.29,
"step": 445,
"train_speed(iter/s)": 0.075079
},
{
"acc": 0.68860197,
"epoch": 1.7612524461839532,
"grad_norm": 0.8203125,
"learning_rate": 8.91564093050458e-05,
"loss": 1.20134068,
"memory(GiB)": 19.33,
"step": 450,
"train_speed(iter/s)": 0.07515
},
{
"epoch": 1.7612524461839532,
"eval_acc": 0.6351924587588373,
"eval_loss": 1.4908838272094727,
"eval_runtime": 71.5161,
"eval_samples_per_second": 1.063,
"eval_steps_per_second": 0.531,
"step": 450
},
{
"acc": 0.65404687,
"epoch": 1.7808219178082192,
"grad_norm": 1.0078125,
"learning_rate": 8.806252375915052e-05,
"loss": 1.31502724,
"memory(GiB)": 19.13,
"step": 455,
"train_speed(iter/s)": 0.074358
},
{
"acc": 0.69379678,
"epoch": 1.8003913894324852,
"grad_norm": 1.1015625,
"learning_rate": 8.696389184838471e-05,
"loss": 1.1870966,
"memory(GiB)": 20.18,
"step": 460,
"train_speed(iter/s)": 0.074437
},
{
"acc": 0.67447538,
"epoch": 1.8199608610567515,
"grad_norm": 1.2890625,
"learning_rate": 8.586080226484789e-05,
"loss": 1.19511604,
"memory(GiB)": 20.09,
"step": 465,
"train_speed(iter/s)": 0.074531
},
{
"acc": 0.67230067,
"epoch": 1.8395303326810177,
"grad_norm": 1.0390625,
"learning_rate": 8.475354487200092e-05,
"loss": 1.30591021,
"memory(GiB)": 19.29,
"step": 470,
"train_speed(iter/s)": 0.074608
},
{
"acc": 0.65006552,
"epoch": 1.8590998043052838,
"grad_norm": 3.21875,
"learning_rate": 8.364241062849732e-05,
"loss": 1.35613279,
"memory(GiB)": 19.51,
"step": 475,
"train_speed(iter/s)": 0.07469
},
{
"acc": 0.66248426,
"epoch": 1.8786692759295498,
"grad_norm": 1.0703125,
"learning_rate": 8.252769151172682e-05,
"loss": 1.34706697,
"memory(GiB)": 19.16,
"step": 480,
"train_speed(iter/s)": 0.074779
},
{
"acc": 0.66462736,
"epoch": 1.898238747553816,
"grad_norm": 0.8515625,
"learning_rate": 8.140968044109134e-05,
"loss": 1.31343336,
"memory(GiB)": 19.17,
"step": 485,
"train_speed(iter/s)": 0.07486
},
{
"acc": 0.65373287,
"epoch": 1.9178082191780823,
"grad_norm": 1.078125,
"learning_rate": 8.028867120103326e-05,
"loss": 1.31145601,
"memory(GiB)": 19.46,
"step": 490,
"train_speed(iter/s)": 0.074941
},
{
"acc": 0.6731041,
"epoch": 1.9373776908023483,
"grad_norm": 0.89453125,
"learning_rate": 7.916495836383648e-05,
"loss": 1.24272699,
"memory(GiB)": 19.45,
"step": 495,
"train_speed(iter/s)": 0.075011
},
{
"acc": 0.66485052,
"epoch": 1.9569471624266144,
"grad_norm": 1.03125,
"learning_rate": 7.80388372122204e-05,
"loss": 1.28164721,
"memory(GiB)": 19.24,
"step": 500,
"train_speed(iter/s)": 0.07509
},
{
"epoch": 1.9569471624266144,
"eval_acc": 0.6349567949725059,
"eval_loss": 1.483258843421936,
"eval_runtime": 72.4797,
"eval_samples_per_second": 1.049,
"eval_steps_per_second": 0.524,
"step": 500
},
{
"acc": 0.68325486,
"epoch": 1.9765166340508806,
"grad_norm": 1.2890625,
"learning_rate": 7.691060366174728e-05,
"loss": 1.2257865,
"memory(GiB)": 22.98,
"step": 505,
"train_speed(iter/s)": 0.074371
},
{
"acc": 0.68977013,
"epoch": 1.9960861056751469,
"grad_norm": 1.0234375,
"learning_rate": 7.578055418306327e-05,
"loss": 1.25723343,
"memory(GiB)": 19.56,
"step": 510,
"train_speed(iter/s)": 0.074471
},
{
"acc": 0.72185702,
"epoch": 2.015655577299413,
"grad_norm": 0.7890625,
"learning_rate": 7.464898572399353e-05,
"loss": 1.01715631,
"memory(GiB)": 20.07,
"step": 515,
"train_speed(iter/s)": 0.074591
},
{
"acc": 0.71889682,
"epoch": 2.035225048923679,
"grad_norm": 1.0625,
"learning_rate": 7.351619563151208e-05,
"loss": 1.03077154,
"memory(GiB)": 19.92,
"step": 520,
"train_speed(iter/s)": 0.074683
},
{
"acc": 0.7505311,
"epoch": 2.0547945205479454,
"grad_norm": 1.9609375,
"learning_rate": 7.238248157360663e-05,
"loss": 0.93218956,
"memory(GiB)": 19.85,
"step": 525,
"train_speed(iter/s)": 0.07477
},
{
"acc": 0.7315311,
"epoch": 2.0743639921722115,
"grad_norm": 1.1875,
"learning_rate": 7.124814146105921e-05,
"loss": 0.96330833,
"memory(GiB)": 19.87,
"step": 530,
"train_speed(iter/s)": 0.074853
},
{
"acc": 0.75555606,
"epoch": 2.0939334637964775,
"grad_norm": 1.3515625,
"learning_rate": 7.011347336916277e-05,
"loss": 0.86877937,
"memory(GiB)": 18.46,
"step": 535,
"train_speed(iter/s)": 0.074938
},
{
"acc": 0.74034052,
"epoch": 2.1135029354207435,
"grad_norm": 1.546875,
"learning_rate": 6.897877545939475e-05,
"loss": 0.90922012,
"memory(GiB)": 19.89,
"step": 540,
"train_speed(iter/s)": 0.075027
},
{
"acc": 0.72400937,
"epoch": 2.1330724070450096,
"grad_norm": 1.90625,
"learning_rate": 6.784434590106808e-05,
"loss": 0.98424711,
"memory(GiB)": 19.11,
"step": 545,
"train_speed(iter/s)": 0.075114
},
{
"acc": 0.77706275,
"epoch": 2.152641878669276,
"grad_norm": 1.359375,
"learning_rate": 6.671048279297972e-05,
"loss": 0.80820856,
"memory(GiB)": 19.86,
"step": 550,
"train_speed(iter/s)": 0.075193
},
{
"epoch": 2.152641878669276,
"eval_acc": 0.6260015710919089,
"eval_loss": 1.6081812381744385,
"eval_runtime": 68.6973,
"eval_samples_per_second": 1.106,
"eval_steps_per_second": 0.553,
"step": 550
},
{
"acc": 0.75351696,
"epoch": 2.172211350293542,
"grad_norm": 2.015625,
"learning_rate": 6.55774840850782e-05,
"loss": 0.86192131,
"memory(GiB)": 22.21,
"step": 555,
"train_speed(iter/s)": 0.074578
},
{
"acc": 0.74249997,
"epoch": 2.191780821917808,
"grad_norm": 1.4609375,
"learning_rate": 6.444564750017003e-05,
"loss": 0.91982813,
"memory(GiB)": 19.87,
"step": 560,
"train_speed(iter/s)": 0.074665
},
{
"acc": 0.73636398,
"epoch": 2.2113502935420746,
"grad_norm": 1.9375,
"learning_rate": 6.331527045568573e-05,
"loss": 0.93448582,
"memory(GiB)": 19.33,
"step": 565,
"train_speed(iter/s)": 0.074752
},
{
"acc": 0.74081583,
"epoch": 2.2309197651663406,
"grad_norm": 2.21875,
"learning_rate": 6.218664998552634e-05,
"loss": 0.94956303,
"memory(GiB)": 19.8,
"step": 570,
"train_speed(iter/s)": 0.074842
},
{
"acc": 0.74573116,
"epoch": 2.2504892367906066,
"grad_norm": 2.546875,
"learning_rate": 6.106008266201046e-05,
"loss": 0.88486786,
"memory(GiB)": 19.92,
"step": 575,
"train_speed(iter/s)": 0.074925
},
{
"acc": 0.75495067,
"epoch": 2.2700587084148727,
"grad_norm": 2.09375,
"learning_rate": 5.9935864517942844e-05,
"loss": 0.84776802,
"memory(GiB)": 19.89,
"step": 580,
"train_speed(iter/s)": 0.075
},
{
"acc": 0.74743519,
"epoch": 2.2896281800391387,
"grad_norm": 1.5859375,
"learning_rate": 5.881429096882449e-05,
"loss": 0.92330503,
"memory(GiB)": 19.03,
"step": 585,
"train_speed(iter/s)": 0.075076
},
{
"acc": 0.74913769,
"epoch": 2.309197651663405,
"grad_norm": 1.6640625,
"learning_rate": 5.769565673522515e-05,
"loss": 0.92942295,
"memory(GiB)": 20.04,
"step": 590,
"train_speed(iter/s)": 0.075149
},
{
"acc": 0.74875064,
"epoch": 2.328767123287671,
"grad_norm": 1.25,
"learning_rate": 5.658025576533832e-05,
"loss": 0.90142069,
"memory(GiB)": 19.96,
"step": 595,
"train_speed(iter/s)": 0.075215
},
{
"acc": 0.74648356,
"epoch": 2.3483365949119372,
"grad_norm": 1.65625,
"learning_rate": 5.546838115773929e-05,
"loss": 0.91528139,
"memory(GiB)": 19.84,
"step": 600,
"train_speed(iter/s)": 0.075292
},
{
"epoch": 2.3483365949119372,
"eval_acc": 0.6284367635506677,
"eval_loss": 1.593437910079956,
"eval_runtime": 68.9856,
"eval_samples_per_second": 1.102,
"eval_steps_per_second": 0.551,
"step": 600
},
{
"acc": 0.75246172,
"epoch": 2.3679060665362037,
"grad_norm": 1.2109375,
"learning_rate": 5.4360325084366416e-05,
"loss": 0.87402363,
"memory(GiB)": 22.69,
"step": 605,
"train_speed(iter/s)": 0.074706
},
{
"acc": 0.74078665,
"epoch": 2.3874755381604698,
"grad_norm": 1.0390625,
"learning_rate": 5.3256378713745815e-05,
"loss": 0.91142588,
"memory(GiB)": 20.15,
"step": 610,
"train_speed(iter/s)": 0.074788
},
{
"acc": 0.75772052,
"epoch": 2.407045009784736,
"grad_norm": 2.03125,
"learning_rate": 5.21568321344799e-05,
"loss": 0.85517597,
"memory(GiB)": 19.37,
"step": 615,
"train_speed(iter/s)": 0.074857
},
{
"acc": 0.75341692,
"epoch": 2.426614481409002,
"grad_norm": 1.40625,
"learning_rate": 5.10619742790194e-05,
"loss": 0.87981377,
"memory(GiB)": 18.91,
"step": 620,
"train_speed(iter/s)": 0.074925
},
{
"acc": 0.76221485,
"epoch": 2.446183953033268,
"grad_norm": 5.5625,
"learning_rate": 4.9972092847739603e-05,
"loss": 0.89623175,
"memory(GiB)": 20.27,
"step": 625,
"train_speed(iter/s)": 0.074994
},
{
"acc": 0.74322577,
"epoch": 2.4657534246575343,
"grad_norm": 1.6796875,
"learning_rate": 4.8887474233339963e-05,
"loss": 0.89493027,
"memory(GiB)": 19.38,
"step": 630,
"train_speed(iter/s)": 0.075068
},
{
"acc": 0.74455509,
"epoch": 2.4853228962818004,
"grad_norm": 1.3046875,
"learning_rate": 4.780840344558753e-05,
"loss": 0.92399101,
"memory(GiB)": 19.32,
"step": 635,
"train_speed(iter/s)": 0.075143
},
{
"acc": 0.75597148,
"epoch": 2.5048923679060664,
"grad_norm": 1.65625,
"learning_rate": 4.673516403642383e-05,
"loss": 0.86396818,
"memory(GiB)": 19.52,
"step": 640,
"train_speed(iter/s)": 0.075214
},
{
"acc": 0.75100412,
"epoch": 2.524461839530333,
"grad_norm": 1.5390625,
"learning_rate": 4.5668038025454554e-05,
"loss": 0.89630232,
"memory(GiB)": 19.54,
"step": 645,
"train_speed(iter/s)": 0.07528
},
{
"acc": 0.74814,
"epoch": 2.544031311154599,
"grad_norm": 1.7265625,
"learning_rate": 4.460730582584228e-05,
"loss": 0.90660105,
"memory(GiB)": 19.46,
"step": 650,
"train_speed(iter/s)": 0.075343
},
{
"epoch": 2.544031311154599,
"eval_acc": 0.6304006284367636,
"eval_loss": 1.6207610368728638,
"eval_runtime": 68.9365,
"eval_samples_per_second": 1.102,
"eval_steps_per_second": 0.551,
"step": 650
},
{
"acc": 0.74153934,
"epoch": 2.563600782778865,
"grad_norm": 2.328125,
"learning_rate": 4.3553246170621e-05,
"loss": 0.90404129,
"memory(GiB)": 19.38,
"step": 655,
"train_speed(iter/s)": 0.074813
},
{
"acc": 0.76082869,
"epoch": 2.583170254403131,
"grad_norm": 1.5390625,
"learning_rate": 4.2506136039452357e-05,
"loss": 0.90251627,
"memory(GiB)": 20.24,
"step": 660,
"train_speed(iter/s)": 0.074877
},
{
"acc": 0.76424356,
"epoch": 2.602739726027397,
"grad_norm": 1.109375,
"learning_rate": 4.146625058584251e-05,
"loss": 0.85076065,
"memory(GiB)": 19.4,
"step": 665,
"train_speed(iter/s)": 0.07494
},
{
"acc": 0.75788155,
"epoch": 2.6223091976516635,
"grad_norm": 1.828125,
"learning_rate": 4.043386306483886e-05,
"loss": 0.8638917,
"memory(GiB)": 18.71,
"step": 670,
"train_speed(iter/s)": 0.075
},
{
"acc": 0.74567804,
"epoch": 2.6418786692759295,
"grad_norm": 1.5078125,
"learning_rate": 3.940924476122573e-05,
"loss": 0.91406345,
"memory(GiB)": 19.53,
"step": 675,
"train_speed(iter/s)": 0.075062
},
{
"acc": 0.77229648,
"epoch": 2.6614481409001955,
"grad_norm": 1.3984375,
"learning_rate": 3.839266491823776e-05,
"loss": 0.79556112,
"memory(GiB)": 19.59,
"step": 680,
"train_speed(iter/s)": 0.075125
},
{
"acc": 0.7331708,
"epoch": 2.681017612524462,
"grad_norm": 1.6015625,
"learning_rate": 3.73843906668096e-05,
"loss": 0.95133247,
"memory(GiB)": 19.69,
"step": 685,
"train_speed(iter/s)": 0.075185
},
{
"acc": 0.76955137,
"epoch": 2.700587084148728,
"grad_norm": 1.4140625,
"learning_rate": 3.6384686955380996e-05,
"loss": 0.82770052,
"memory(GiB)": 19.53,
"step": 690,
"train_speed(iter/s)": 0.075245
},
{
"acc": 0.73245583,
"epoch": 2.720156555772994,
"grad_norm": 1.59375,
"learning_rate": 3.539381648027495e-05,
"loss": 0.93347349,
"memory(GiB)": 19.38,
"step": 695,
"train_speed(iter/s)": 0.075313
},
{
"acc": 0.7664053,
"epoch": 2.73972602739726,
"grad_norm": 1.4296875,
"learning_rate": 3.441203961666818e-05,
"loss": 0.84118309,
"memory(GiB)": 19.55,
"step": 700,
"train_speed(iter/s)": 0.075373
},
{
"epoch": 2.73972602739726,
"eval_acc": 0.628750981932443,
"eval_loss": 1.5982366800308228,
"eval_runtime": 69.1268,
"eval_samples_per_second": 1.099,
"eval_steps_per_second": 0.55,
"step": 700
},
{
"acc": 0.74386759,
"epoch": 2.759295499021526,
"grad_norm": 2.21875,
"learning_rate": 3.343961435017094e-05,
"loss": 0.92712116,
"memory(GiB)": 23.1,
"step": 705,
"train_speed(iter/s)": 0.074881
},
{
"acc": 0.75352135,
"epoch": 2.7788649706457926,
"grad_norm": 1.5625,
"learning_rate": 3.247679620903533e-05,
"loss": 0.90610752,
"memory(GiB)": 19.56,
"step": 710,
"train_speed(iter/s)": 0.074934
},
{
"acc": 0.75765467,
"epoch": 2.7984344422700587,
"grad_norm": 4.4375,
"learning_rate": 3.1523838197008956e-05,
"loss": 0.88628139,
"memory(GiB)": 19.44,
"step": 715,
"train_speed(iter/s)": 0.074999
},
{
"acc": 0.763375,
"epoch": 2.8180039138943247,
"grad_norm": 1.1640625,
"learning_rate": 3.058099072685204e-05,
"loss": 0.86159172,
"memory(GiB)": 19.5,
"step": 720,
"train_speed(iter/s)": 0.075059
},
{
"acc": 0.75694184,
"epoch": 2.837573385518591,
"grad_norm": 1.6171875,
"learning_rate": 2.964850155453543e-05,
"loss": 0.85433092,
"memory(GiB)": 19.38,
"step": 725,
"train_speed(iter/s)": 0.075121
},
{
"acc": 0.76086893,
"epoch": 2.857142857142857,
"grad_norm": 1.5859375,
"learning_rate": 2.8726615714136827e-05,
"loss": 0.8608798,
"memory(GiB)": 19.58,
"step": 730,
"train_speed(iter/s)": 0.075181
},
{
"acc": 0.74008894,
"epoch": 2.8767123287671232,
"grad_norm": 1.4375,
"learning_rate": 2.7815575453452058e-05,
"loss": 0.98413734,
"memory(GiB)": 19.59,
"step": 735,
"train_speed(iter/s)": 0.075242
},
{
"acc": 0.75941825,
"epoch": 2.8962818003913893,
"grad_norm": 1.7734375,
"learning_rate": 2.6915620170338612e-05,
"loss": 0.85438929,
"memory(GiB)": 19.39,
"step": 740,
"train_speed(iter/s)": 0.075307
},
{
"acc": 0.77891464,
"epoch": 2.9158512720156553,
"grad_norm": 1.7265625,
"learning_rate": 2.6026986349808058e-05,
"loss": 0.79716868,
"memory(GiB)": 19.61,
"step": 745,
"train_speed(iter/s)": 0.075361
},
{
"acc": 0.75023217,
"epoch": 2.935420743639922,
"grad_norm": 1.28125,
"learning_rate": 2.514990750188399e-05,
"loss": 0.85774508,
"memory(GiB)": 18.86,
"step": 750,
"train_speed(iter/s)": 0.075417
},
{
"epoch": 2.935420743639922,
"eval_acc": 0.6324430479183032,
"eval_loss": 1.5986852645874023,
"eval_runtime": 69.3348,
"eval_samples_per_second": 1.096,
"eval_steps_per_second": 0.548,
"step": 750
},
{
"acc": 0.74531512,
"epoch": 2.954990215264188,
"grad_norm": 1.5625,
"learning_rate": 2.4284614100241538e-05,
"loss": 0.93483381,
"memory(GiB)": 23.14,
"step": 755,
"train_speed(iter/s)": 0.074953
},
{
"acc": 0.76761031,
"epoch": 2.974559686888454,
"grad_norm": 1.6171875,
"learning_rate": 2.343133352164477e-05,
"loss": 0.84630623,
"memory(GiB)": 19.36,
"step": 760,
"train_speed(iter/s)": 0.075015
},
{
"acc": 0.75018072,
"epoch": 2.9941291585127203,
"grad_norm": 1.5703125,
"learning_rate": 2.2590289986198136e-05,
"loss": 0.89352074,
"memory(GiB)": 19.6,
"step": 765,
"train_speed(iter/s)": 0.075072
},
{
"acc": 0.80383377,
"epoch": 3.0136986301369864,
"grad_norm": 1.453125,
"learning_rate": 2.1761704498427003e-05,
"loss": 0.68276234,
"memory(GiB)": 19.62,
"step": 770,
"train_speed(iter/s)": 0.075153
},
{
"acc": 0.82252359,
"epoch": 3.0332681017612524,
"grad_norm": 1.328125,
"learning_rate": 2.094579478920358e-05,
"loss": 0.64008789,
"memory(GiB)": 19.76,
"step": 775,
"train_speed(iter/s)": 0.075213
},
{
"acc": 0.83448801,
"epoch": 3.0528375733855184,
"grad_norm": 1.8828125,
"learning_rate": 2.0142775258532654e-05,
"loss": 0.61610913,
"memory(GiB)": 19.59,
"step": 780,
"train_speed(iter/s)": 0.075271
},
{
"acc": 0.83116817,
"epoch": 3.072407045009785,
"grad_norm": 1.5546875,
"learning_rate": 1.9352856919212994e-05,
"loss": 0.58688097,
"memory(GiB)": 19.53,
"step": 785,
"train_speed(iter/s)": 0.075323
},
{
"acc": 0.82525949,
"epoch": 3.091976516634051,
"grad_norm": 1.4375,
"learning_rate": 1.8576247341388544e-05,
"loss": 0.62312498,
"memory(GiB)": 19.85,
"step": 790,
"train_speed(iter/s)": 0.07537
},
{
"acc": 0.81645441,
"epoch": 3.111545988258317,
"grad_norm": 1.65625,
"learning_rate": 1.7813150598004313e-05,
"loss": 0.62203112,
"memory(GiB)": 19.79,
"step": 795,
"train_speed(iter/s)": 0.075423
},
{
"acc": 0.83432789,
"epoch": 3.131115459882583,
"grad_norm": 1.5859375,
"learning_rate": 1.7063767211181333e-05,
"loss": 0.60077624,
"memory(GiB)": 19.52,
"step": 800,
"train_speed(iter/s)": 0.07548
},
{
"epoch": 3.131115459882583,
"eval_acc": 0.6209740769835035,
"eval_loss": 1.7955598831176758,
"eval_runtime": 69.0109,
"eval_samples_per_second": 1.101,
"eval_steps_per_second": 0.551,
"step": 800
},
{
"acc": 0.82124023,
"epoch": 3.1506849315068495,
"grad_norm": 1.7578125,
"learning_rate": 1.6328294099524644e-05,
"loss": 0.60847788,
"memory(GiB)": 22.65,
"step": 805,
"train_speed(iter/s)": 0.075043
},
{
"acc": 0.83265171,
"epoch": 3.1702544031311155,
"grad_norm": 4.09375,
"learning_rate": 1.5606924526378136e-05,
"loss": 0.57863126,
"memory(GiB)": 18.89,
"step": 810,
"train_speed(iter/s)": 0.07509
},
{
"acc": 0.8407362,
"epoch": 3.1898238747553815,
"grad_norm": 1.1796875,
"learning_rate": 1.4899848049039881e-05,
"loss": 0.53706379,
"memory(GiB)": 19.37,
"step": 815,
"train_speed(iter/s)": 0.075142
},
{
"acc": 0.82116756,
"epoch": 3.2093933463796476,
"grad_norm": 1.859375,
"learning_rate": 1.4207250468951426e-05,
"loss": 0.64039102,
"memory(GiB)": 19.52,
"step": 820,
"train_speed(iter/s)": 0.075197
},
{
"acc": 0.85004549,
"epoch": 3.228962818003914,
"grad_norm": 1.0390625,
"learning_rate": 1.3529313782874023e-05,
"loss": 0.53315983,
"memory(GiB)": 19.52,
"step": 825,
"train_speed(iter/s)": 0.07525
},
{
"acc": 0.83273296,
"epoch": 3.24853228962818,
"grad_norm": 1.578125,
"learning_rate": 1.2866216135064487e-05,
"loss": 0.58545351,
"memory(GiB)": 19.36,
"step": 830,
"train_speed(iter/s)": 0.075303
},
{
"acc": 0.80788403,
"epoch": 3.268101761252446,
"grad_norm": 2.296875,
"learning_rate": 1.2218131770463487e-05,
"loss": 0.67468171,
"memory(GiB)": 19.28,
"step": 835,
"train_speed(iter/s)": 0.075356
},
{
"acc": 0.8440134,
"epoch": 3.287671232876712,
"grad_norm": 1.21875,
"learning_rate": 1.1585230988908576e-05,
"loss": 0.55293651,
"memory(GiB)": 19.37,
"step": 840,
"train_speed(iter/s)": 0.07541
},
{
"acc": 0.81569691,
"epoch": 3.3072407045009786,
"grad_norm": 1.671875,
"learning_rate": 1.0967680100383645e-05,
"loss": 0.61190109,
"memory(GiB)": 18.09,
"step": 845,
"train_speed(iter/s)": 0.075466
},
{
"acc": 0.84766483,
"epoch": 3.3268101761252447,
"grad_norm": 1.8046875,
"learning_rate": 1.0365641381317113e-05,
"loss": 0.52525816,
"memory(GiB)": 19.31,
"step": 850,
"train_speed(iter/s)": 0.075523
},
{
"epoch": 3.3268101761252447,
"eval_acc": 0.6203456402199529,
"eval_loss": 1.7881730794906616,
"eval_runtime": 69.1552,
"eval_samples_per_second": 1.099,
"eval_steps_per_second": 0.549,
"step": 850
},
{
"acc": 0.84491625,
"epoch": 3.3463796477495107,
"grad_norm": 1.8046875,
"learning_rate": 9.779273031939692e-06,
"loss": 0.56272998,
"memory(GiB)": 23.04,
"step": 855,
"train_speed(iter/s)": 0.07511
},
{
"acc": 0.84104662,
"epoch": 3.3659491193737767,
"grad_norm": 1.796875,
"learning_rate": 9.20872913471363e-06,
"loss": 0.57019663,
"memory(GiB)": 19.42,
"step": 860,
"train_speed(iter/s)": 0.075157
},
{
"acc": 0.84433002,
"epoch": 3.385518590998043,
"grad_norm": 1.6484375,
"learning_rate": 8.654159613843715e-06,
"loss": 0.55449514,
"memory(GiB)": 19.59,
"step": 865,
"train_speed(iter/s)": 0.07521
},
{
"acc": 0.80005312,
"epoch": 3.4050880626223092,
"grad_norm": 1.46875,
"learning_rate": 8.115710195881068e-06,
"loss": 0.73595409,
"memory(GiB)": 19.36,
"step": 870,
"train_speed(iter/s)": 0.075258
},
{
"acc": 0.83217945,
"epoch": 3.4246575342465753,
"grad_norm": 3.328125,
"learning_rate": 7.593522371429972e-06,
"loss": 0.58270836,
"memory(GiB)": 19.58,
"step": 875,
"train_speed(iter/s)": 0.075306
},
{
"acc": 0.82742786,
"epoch": 3.4442270058708413,
"grad_norm": 1.234375,
"learning_rate": 7.0877333579678585e-06,
"loss": 0.59052157,
"memory(GiB)": 19.6,
"step": 880,
"train_speed(iter/s)": 0.075358
},
{
"acc": 0.81994705,
"epoch": 3.4637964774951078,
"grad_norm": 1.7578125,
"learning_rate": 6.598476063788036e-06,
"loss": 0.62256751,
"memory(GiB)": 19.56,
"step": 885,
"train_speed(iter/s)": 0.075405
},
{
"acc": 0.8157341,
"epoch": 3.483365949119374,
"grad_norm": 1.8203125,
"learning_rate": 6.12587905307477e-06,
"loss": 0.66806622,
"memory(GiB)": 19.49,
"step": 890,
"train_speed(iter/s)": 0.075454
},
{
"acc": 0.82838688,
"epoch": 3.50293542074364,
"grad_norm": 1.515625,
"learning_rate": 5.67006651212008e-06,
"loss": 0.63044977,
"memory(GiB)": 19.54,
"step": 895,
"train_speed(iter/s)": 0.075497
},
{
"acc": 0.79130597,
"epoch": 3.5225048923679063,
"grad_norm": 1.640625,
"learning_rate": 5.2311582166906605e-06,
"loss": 0.7558567,
"memory(GiB)": 19.28,
"step": 900,
"train_speed(iter/s)": 0.07555
},
{
"epoch": 3.5225048923679063,
"eval_acc": 0.6211311861743912,
"eval_loss": 1.7854998111724854,
"eval_runtime": 69.2434,
"eval_samples_per_second": 1.098,
"eval_steps_per_second": 0.549,
"step": 900
}
],
"logging_steps": 5,
"max_steps": 1020,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.605539502350213e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}