Initial commit of fine-tuned model

Browse files

Files changed (5) hide show

adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +685 -5

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:473f0d4a80ac0df61e01088932e49cb6824225d5d4bf3f360908abdfb9a580f3
 size 54543184

 version https://git-lfs.github.com/spec/v1
+oid sha256:d4906ed3de48fb0de630d14e495577b07cec0878941b9c89aa8081b1d906b340
 size 54543184

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bfe44e4195868e44c40aacc562681b1950bdc51cc1f36a646ad67202229b307f
 size 109130618

 version https://git-lfs.github.com/spec/v1
+oid sha256:d2e45fb0c3cffb55cff93b8eab278a57f60af5f661a94d98aca9a63030091bee
 size 109130618

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bc2acf47f68caa7d18b212606808dac263cf20e72e1bf6e3dd9688c46d8616b4
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:c74c344198d9960751843e4c11fd2221f86d11dba060b029f4f7201c81ce036e
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d03f8df28661f098b1ac9b3cb4165605c8f0f87b62ef0eff1506af174fbbc387
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe9f6904a7003fa25bc1c72394e6d3f620e7e1016a868c0c50ad1d7ebd7f9390
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "best_metric": 2.096661329269409,
-  "best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-385",
-  "epoch": 4.0,
   "eval_steps": 100,
-  "global_step": 385,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2734,6 +2734,686 @@
       "eval_samples_per_second": 2.703,
       "eval_steps_per_second": 0.432,
       "step": 385
     }
   ],
   "logging_steps": 1,
@@ -2753,7 +3433,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.55214242753151e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 2.0890417098999023,
+  "best_model_checkpoint": "/data/sunggeunan/ICL/src/outputs/Meta-Llama-3-8B-Instruct_qa_ft_QA_mrqa_nq_SQuAD_3shot_1docs/checkpoint-481",
+  "epoch": 4.997402597402598,
   "eval_steps": 100,
+  "global_step": 481,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2.703,
       "eval_steps_per_second": 0.432,
       "step": 385
+    },
+    {
+      "epoch": 4.01038961038961,
+      "grad_norm": 0.3841288685798645,
+      "learning_rate": 3.9087719298245615e-07,
+      "loss": 2.1132,
+      "step": 386
+    },
+    {
+      "epoch": 4.020779220779221,
+      "grad_norm": 0.3716009855270386,
+      "learning_rate": 3.9052631578947363e-07,
+      "loss": 2.1147,
+      "step": 387
+    },
+    {
+      "epoch": 4.031168831168831,
+      "grad_norm": 0.37986499071121216,
+      "learning_rate": 3.9017543859649123e-07,
+      "loss": 2.085,
+      "step": 388
+    },
+    {
+      "epoch": 4.041558441558442,
+      "grad_norm": 0.37856024503707886,
+      "learning_rate": 3.8982456140350877e-07,
+      "loss": 2.0922,
+      "step": 389
+    },
+    {
+      "epoch": 4.0519480519480515,
+      "grad_norm": 0.3767644464969635,
+      "learning_rate": 3.894736842105263e-07,
+      "loss": 2.0762,
+      "step": 390
+    },
+    {
+      "epoch": 4.062337662337662,
+      "grad_norm": 0.3725319802761078,
+      "learning_rate": 3.8912280701754385e-07,
+      "loss": 2.0878,
+      "step": 391
+    },
+    {
+      "epoch": 4.072727272727272,
+      "grad_norm": 0.35704779624938965,
+      "learning_rate": 3.8877192982456134e-07,
+      "loss": 2.0943,
+      "step": 392
+    },
+    {
+      "epoch": 4.083116883116883,
+      "grad_norm": 0.3813813626766205,
+      "learning_rate": 3.8842105263157893e-07,
+      "loss": 2.0769,
+      "step": 393
+    },
+    {
+      "epoch": 4.093506493506493,
+      "grad_norm": 0.3729550540447235,
+      "learning_rate": 3.8807017543859647e-07,
+      "loss": 2.0747,
+      "step": 394
+    },
+    {
+      "epoch": 4.103896103896104,
+      "grad_norm": 0.37814322113990784,
+      "learning_rate": 3.87719298245614e-07,
+      "loss": 2.0834,
+      "step": 395
+    },
+    {
+      "epoch": 4.114285714285714,
+      "grad_norm": 0.3847734034061432,
+      "learning_rate": 3.8736842105263155e-07,
+      "loss": 2.0945,
+      "step": 396
+    },
+    {
+      "epoch": 4.124675324675325,
+      "grad_norm": 0.3798567056655884,
+      "learning_rate": 3.8701754385964915e-07,
+      "loss": 2.0756,
+      "step": 397
+    },
+    {
+      "epoch": 4.135064935064935,
+      "grad_norm": 0.3700884282588959,
+      "learning_rate": 3.8666666666666664e-07,
+      "loss": 2.0621,
+      "step": 398
+    },
+    {
+      "epoch": 4.1454545454545455,
+      "grad_norm": 0.36993175745010376,
+      "learning_rate": 3.8631578947368423e-07,
+      "loss": 2.1099,
+      "step": 399
+    },
+    {
+      "epoch": 4.1558441558441555,
+      "grad_norm": 0.380310595035553,
+      "learning_rate": 3.859649122807017e-07,
+      "loss": 2.0981,
+      "step": 400
+    },
+    {
+      "epoch": 4.166233766233766,
+      "grad_norm": 0.38853439688682556,
+      "learning_rate": 3.8561403508771926e-07,
+      "loss": 2.1049,
+      "step": 401
+    },
+    {
+      "epoch": 4.176623376623376,
+      "grad_norm": 0.3776048421859741,
+      "learning_rate": 3.8526315789473685e-07,
+      "loss": 2.0446,
+      "step": 402
+    },
+    {
+      "epoch": 4.187012987012987,
+      "grad_norm": 0.38195568323135376,
+      "learning_rate": 3.8491228070175434e-07,
+      "loss": 2.0813,
+      "step": 403
+    },
+    {
+      "epoch": 4.197402597402597,
+      "grad_norm": 0.373125284910202,
+      "learning_rate": 3.8456140350877193e-07,
+      "loss": 2.0903,
+      "step": 404
+    },
+    {
+      "epoch": 4.207792207792208,
+      "grad_norm": 0.3781803548336029,
+      "learning_rate": 3.842105263157894e-07,
+      "loss": 2.093,
+      "step": 405
+    },
+    {
+      "epoch": 4.218181818181818,
+      "grad_norm": 0.38378608226776123,
+      "learning_rate": 3.83859649122807e-07,
+      "loss": 2.0772,
+      "step": 406
+    },
+    {
+      "epoch": 4.228571428571429,
+      "grad_norm": 0.3815755248069763,
+      "learning_rate": 3.8350877192982455e-07,
+      "loss": 2.0876,
+      "step": 407
+    },
+    {
+      "epoch": 4.238961038961039,
+      "grad_norm": 0.3809583783149719,
+      "learning_rate": 3.831578947368421e-07,
+      "loss": 2.0631,
+      "step": 408
+    },
+    {
+      "epoch": 4.249350649350649,
+      "grad_norm": 0.3809110224246979,
+      "learning_rate": 3.8280701754385964e-07,
+      "loss": 2.1069,
+      "step": 409
+    },
+    {
+      "epoch": 4.259740259740259,
+      "grad_norm": 0.37152138352394104,
+      "learning_rate": 3.824561403508772e-07,
+      "loss": 2.0738,
+      "step": 410
+    },
+    {
+      "epoch": 4.27012987012987,
+      "grad_norm": 0.3761196434497833,
+      "learning_rate": 3.821052631578947e-07,
+      "loss": 2.0994,
+      "step": 411
+    },
+    {
+      "epoch": 4.28051948051948,
+      "grad_norm": 0.39031481742858887,
+      "learning_rate": 3.8175438596491226e-07,
+      "loss": 2.0857,
+      "step": 412
+    },
+    {
+      "epoch": 4.290909090909091,
+      "grad_norm": 0.37237513065338135,
+      "learning_rate": 3.814035087719298e-07,
+      "loss": 2.0844,
+      "step": 413
+    },
+    {
+      "epoch": 4.301298701298701,
+      "grad_norm": 0.38423943519592285,
+      "learning_rate": 3.8105263157894734e-07,
+      "loss": 2.126,
+      "step": 414
+    },
+    {
+      "epoch": 4.311688311688312,
+      "grad_norm": 0.36542361974716187,
+      "learning_rate": 3.8070175438596493e-07,
+      "loss": 2.0705,
+      "step": 415
+    },
+    {
+      "epoch": 4.322077922077922,
+      "grad_norm": 0.36861154437065125,
+      "learning_rate": 3.803508771929824e-07,
+      "loss": 2.0681,
+      "step": 416
+    },
+    {
+      "epoch": 4.332467532467533,
+      "grad_norm": 0.3783316910266876,
+      "learning_rate": 3.7999999999999996e-07,
+      "loss": 2.0777,
+      "step": 417
+    },
+    {
+      "epoch": 4.3428571428571425,
+      "grad_norm": 0.38323143124580383,
+      "learning_rate": 3.7964912280701756e-07,
+      "loss": 2.0952,
+      "step": 418
+    },
+    {
+      "epoch": 4.353246753246753,
+      "grad_norm": 0.3862488269805908,
+      "learning_rate": 3.7929824561403504e-07,
+      "loss": 2.0927,
+      "step": 419
+    },
+    {
+      "epoch": 4.363636363636363,
+      "grad_norm": 0.38100945949554443,
+      "learning_rate": 3.7894736842105264e-07,
+      "loss": 2.1043,
+      "step": 420
+    },
+    {
+      "epoch": 4.374025974025974,
+      "grad_norm": 0.38466402888298035,
+      "learning_rate": 3.785964912280701e-07,
+      "loss": 2.0906,
+      "step": 421
+    },
+    {
+      "epoch": 4.384415584415584,
+      "grad_norm": 0.37953078746795654,
+      "learning_rate": 3.782456140350877e-07,
+      "loss": 2.0706,
+      "step": 422
+    },
+    {
+      "epoch": 4.394805194805195,
+      "grad_norm": 0.3823856711387634,
+      "learning_rate": 3.7789473684210526e-07,
+      "loss": 2.0951,
+      "step": 423
+    },
+    {
+      "epoch": 4.405194805194805,
+      "grad_norm": 0.37538549304008484,
+      "learning_rate": 3.775438596491228e-07,
+      "loss": 2.0771,
+      "step": 424
+    },
+    {
+      "epoch": 4.415584415584416,
+      "grad_norm": 0.3802937865257263,
+      "learning_rate": 3.7719298245614034e-07,
+      "loss": 2.078,
+      "step": 425
+    },
+    {
+      "epoch": 4.425974025974026,
+      "grad_norm": 0.3733079433441162,
+      "learning_rate": 3.7684210526315783e-07,
+      "loss": 2.0799,
+      "step": 426
+    },
+    {
+      "epoch": 4.4363636363636365,
+      "grad_norm": 0.37729039788246155,
+      "learning_rate": 3.764912280701754e-07,
+      "loss": 2.1051,
+      "step": 427
+    },
+    {
+      "epoch": 4.4467532467532465,
+      "grad_norm": 0.3915861248970032,
+      "learning_rate": 3.7614035087719296e-07,
+      "loss": 2.0927,
+      "step": 428
+    },
+    {
+      "epoch": 4.457142857142857,
+      "grad_norm": 0.38771378993988037,
+      "learning_rate": 3.757894736842105e-07,
+      "loss": 2.0989,
+      "step": 429
+    },
+    {
+      "epoch": 4.467532467532467,
+      "grad_norm": 0.3854687213897705,
+      "learning_rate": 3.7543859649122804e-07,
+      "loss": 2.0984,
+      "step": 430
+    },
+    {
+      "epoch": 4.477922077922078,
+      "grad_norm": 0.3793568015098572,
+      "learning_rate": 3.7508771929824564e-07,
+      "loss": 2.0804,
+      "step": 431
+    },
+    {
+      "epoch": 4.488311688311688,
+      "grad_norm": 0.39430853724479675,
+      "learning_rate": 3.747368421052631e-07,
+      "loss": 2.0985,
+      "step": 432
+    },
+    {
+      "epoch": 4.498701298701299,
+      "grad_norm": 0.3847366273403168,
+      "learning_rate": 3.743859649122807e-07,
+      "loss": 2.0849,
+      "step": 433
+    },
+    {
+      "epoch": 4.509090909090909,
+      "grad_norm": 0.374398797750473,
+      "learning_rate": 3.740350877192982e-07,
+      "loss": 2.0847,
+      "step": 434
+    },
+    {
+      "epoch": 4.51948051948052,
+      "grad_norm": 0.4258849620819092,
+      "learning_rate": 3.7368421052631575e-07,
+      "loss": 2.082,
+      "step": 435
+    },
+    {
+      "epoch": 4.52987012987013,
+      "grad_norm": 0.3853350579738617,
+      "learning_rate": 3.7333333333333334e-07,
+      "loss": 2.0832,
+      "step": 436
+    },
+    {
+      "epoch": 4.54025974025974,
+      "grad_norm": 0.38020631670951843,
+      "learning_rate": 3.7298245614035083e-07,
+      "loss": 2.0799,
+      "step": 437
+    },
+    {
+      "epoch": 4.55064935064935,
+      "grad_norm": 0.4022679030895233,
+      "learning_rate": 3.726315789473684e-07,
+      "loss": 2.1038,
+      "step": 438
+    },
+    {
+      "epoch": 4.561038961038961,
+      "grad_norm": 0.37137728929519653,
+      "learning_rate": 3.7228070175438596e-07,
+      "loss": 2.0921,
+      "step": 439
+    },
+    {
+      "epoch": 4.571428571428571,
+      "grad_norm": 0.38251206278800964,
+      "learning_rate": 3.719298245614035e-07,
+      "loss": 2.0878,
+      "step": 440
+    },
+    {
+      "epoch": 4.581818181818182,
+      "grad_norm": 0.39200717210769653,
+      "learning_rate": 3.7157894736842104e-07,
+      "loss": 2.1202,
+      "step": 441
+    },
+    {
+      "epoch": 4.592207792207792,
+      "grad_norm": 0.3731335699558258,
+      "learning_rate": 3.7122807017543853e-07,
+      "loss": 2.0737,
+      "step": 442
+    },
+    {
+      "epoch": 4.602597402597403,
+      "grad_norm": 0.38276833295822144,
+      "learning_rate": 3.708771929824561e-07,
+      "loss": 2.0521,
+      "step": 443
+    },
+    {
+      "epoch": 4.612987012987013,
+      "grad_norm": 0.38775137066841125,
+      "learning_rate": 3.7052631578947367e-07,
+      "loss": 2.0772,
+      "step": 444
+    },
+    {
+      "epoch": 4.623376623376624,
+      "grad_norm": 0.3836955428123474,
+      "learning_rate": 3.701754385964912e-07,
+      "loss": 2.0992,
+      "step": 445
+    },
+    {
+      "epoch": 4.6337662337662335,
+      "grad_norm": 0.37715139985084534,
+      "learning_rate": 3.6982456140350875e-07,
+      "loss": 2.0499,
+      "step": 446
+    },
+    {
+      "epoch": 4.644155844155844,
+      "grad_norm": 0.3789008557796478,
+      "learning_rate": 3.6947368421052634e-07,
+      "loss": 2.0531,
+      "step": 447
+    },
+    {
+      "epoch": 4.654545454545454,
+      "grad_norm": 0.3865036964416504,
+      "learning_rate": 3.6912280701754383e-07,
+      "loss": 2.0949,
+      "step": 448
+    },
+    {
+      "epoch": 4.664935064935065,
+      "grad_norm": 0.3880210816860199,
+      "learning_rate": 3.687719298245614e-07,
+      "loss": 2.0871,
+      "step": 449
+    },
+    {
+      "epoch": 4.675324675324675,
+      "grad_norm": 0.3839876353740692,
+      "learning_rate": 3.684210526315789e-07,
+      "loss": 2.0586,
+      "step": 450
+    },
+    {
+      "epoch": 4.685714285714286,
+      "grad_norm": 0.39316463470458984,
+      "learning_rate": 3.6807017543859645e-07,
+      "loss": 2.0736,
+      "step": 451
+    },
+    {
+      "epoch": 4.696103896103896,
+      "grad_norm": 0.37328803539276123,
+      "learning_rate": 3.6771929824561405e-07,
+      "loss": 2.084,
+      "step": 452
+    },
+    {
+      "epoch": 4.706493506493507,
+      "grad_norm": 0.3884430527687073,
+      "learning_rate": 3.6736842105263153e-07,
+      "loss": 2.0788,
+      "step": 453
+    },
+    {
+      "epoch": 4.716883116883117,
+      "grad_norm": 0.385623574256897,
+      "learning_rate": 3.6701754385964913e-07,
+      "loss": 2.0705,
+      "step": 454
+    },
+    {
+      "epoch": 4.7272727272727275,
+      "grad_norm": 0.38950812816619873,
+      "learning_rate": 3.666666666666666e-07,
+      "loss": 2.0785,
+      "step": 455
+    },
+    {
+      "epoch": 4.7376623376623375,
+      "grad_norm": 0.38535040616989136,
+      "learning_rate": 3.663157894736842e-07,
+      "loss": 2.0909,
+      "step": 456
+    },
+    {
+      "epoch": 4.748051948051948,
+      "grad_norm": 0.3869593143463135,
+      "learning_rate": 3.6596491228070175e-07,
+      "loss": 2.0801,
+      "step": 457
+    },
+    {
+      "epoch": 4.758441558441558,
+      "grad_norm": 0.39084428548812866,
+      "learning_rate": 3.656140350877193e-07,
+      "loss": 2.096,
+      "step": 458
+    },
+    {
+      "epoch": 4.768831168831169,
+      "grad_norm": 0.3794546127319336,
+      "learning_rate": 3.6526315789473683e-07,
+      "loss": 2.0527,
+      "step": 459
+    },
+    {
+      "epoch": 4.779220779220779,
+      "grad_norm": 0.3870809078216553,
+      "learning_rate": 3.6491228070175437e-07,
+      "loss": 2.0853,
+      "step": 460
+    },
+    {
+      "epoch": 4.78961038961039,
+      "grad_norm": 0.38205036520957947,
+      "learning_rate": 3.645614035087719e-07,
+      "loss": 2.0643,
+      "step": 461
+    },
+    {
+      "epoch": 4.8,
+      "grad_norm": 0.3907061815261841,
+      "learning_rate": 3.6421052631578945e-07,
+      "loss": 2.0786,
+      "step": 462
+    },
+    {
+      "epoch": 4.810389610389611,
+      "grad_norm": 0.39493080973625183,
+      "learning_rate": 3.63859649122807e-07,
+      "loss": 2.0944,
+      "step": 463
+    },
+    {
+      "epoch": 4.820779220779221,
+      "grad_norm": 0.3930380046367645,
+      "learning_rate": 3.6350877192982453e-07,
+      "loss": 2.1138,
+      "step": 464
+    },
+    {
+      "epoch": 4.8311688311688314,
+      "grad_norm": 0.3952060639858246,
+      "learning_rate": 3.6315789473684213e-07,
+      "loss": 2.0802,
+      "step": 465
+    },
+    {
+      "epoch": 4.841558441558441,
+      "grad_norm": 0.3815995752811432,
+      "learning_rate": 3.628070175438596e-07,
+      "loss": 2.0838,
+      "step": 466
+    },
+    {
+      "epoch": 4.851948051948052,
+      "grad_norm": 0.38858020305633545,
+      "learning_rate": 3.6245614035087716e-07,
+      "loss": 2.0804,
+      "step": 467
+    },
+    {
+      "epoch": 4.862337662337662,
+      "grad_norm": 0.385565847158432,
+      "learning_rate": 3.6210526315789475e-07,
+      "loss": 2.0974,
+      "step": 468
+    },
+    {
+      "epoch": 4.872727272727273,
+      "grad_norm": 0.3909178078174591,
+      "learning_rate": 3.6175438596491224e-07,
+      "loss": 2.0887,
+      "step": 469
+    },
+    {
+      "epoch": 4.883116883116883,
+      "grad_norm": 0.3982325792312622,
+      "learning_rate": 3.6140350877192983e-07,
+      "loss": 2.1054,
+      "step": 470
+    },
+    {
+      "epoch": 4.893506493506494,
+      "grad_norm": 0.3876339793205261,
+      "learning_rate": 3.610526315789473e-07,
+      "loss": 2.1054,
+      "step": 471
+    },
+    {
+      "epoch": 4.903896103896104,
+      "grad_norm": 0.3819069266319275,
+      "learning_rate": 3.607017543859649e-07,
+      "loss": 2.0821,
+      "step": 472
+    },
+    {
+      "epoch": 4.914285714285715,
+      "grad_norm": 0.3924694359302521,
+      "learning_rate": 3.6035087719298245e-07,
+      "loss": 2.0712,
+      "step": 473
+    },
+    {
+      "epoch": 4.9246753246753245,
+      "grad_norm": 0.3937675654888153,
+      "learning_rate": 3.6e-07,
+      "loss": 2.1057,
+      "step": 474
+    },
+    {
+      "epoch": 4.935064935064935,
+      "grad_norm": 0.38620275259017944,
+      "learning_rate": 3.5964912280701754e-07,
+      "loss": 2.0845,
+      "step": 475
+    },
+    {
+      "epoch": 4.945454545454545,
+      "grad_norm": 0.40442150831222534,
+      "learning_rate": 3.59298245614035e-07,
+      "loss": 2.0936,
+      "step": 476
+    },
+    {
+      "epoch": 4.955844155844156,
+      "grad_norm": 0.3815317153930664,
+      "learning_rate": 3.589473684210526e-07,
+      "loss": 2.0845,
+      "step": 477
+    },
+    {
+      "epoch": 4.966233766233766,
+      "grad_norm": 0.38584476709365845,
+      "learning_rate": 3.5859649122807016e-07,
+      "loss": 2.071,
+      "step": 478
+    },
+    {
+      "epoch": 4.976623376623377,
+      "grad_norm": 0.3887505829334259,
+      "learning_rate": 3.582456140350877e-07,
+      "loss": 2.0924,
+      "step": 479
+    },
+    {
+      "epoch": 4.987012987012987,
+      "grad_norm": 0.3836219012737274,
+      "learning_rate": 3.5789473684210524e-07,
+      "loss": 2.0986,
+      "step": 480
+    },
+    {
+      "epoch": 4.997402597402598,
+      "grad_norm": 0.38430362939834595,
+      "learning_rate": 3.5754385964912283e-07,
+      "loss": 2.1006,
+      "step": 481
+    },
+    {
+      "epoch": 4.997402597402598,
+      "eval_loss": 2.0890417098999023,
+      "eval_runtime": 9.2277,
+      "eval_samples_per_second": 2.709,
+      "eval_steps_per_second": 0.433,
+      "step": 481
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 5.690178034414387e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null