Training in progress, step 343000, checkpoint
Browse files
last-checkpoint/adapter_config.json
CHANGED
@@ -23,13 +23,13 @@
|
|
23 |
"rank_pattern": {},
|
24 |
"revision": null,
|
25 |
"target_modules": [
|
26 |
-
"up_proj",
|
27 |
-
"gate_proj",
|
28 |
-
"k_proj",
|
29 |
-
"v_proj",
|
30 |
"q_proj",
|
|
|
|
|
|
|
|
|
31 |
"o_proj",
|
32 |
-
"
|
33 |
],
|
34 |
"task_type": "CAUSAL_LM",
|
35 |
"use_dora": false,
|
|
|
23 |
"rank_pattern": {},
|
24 |
"revision": null,
|
25 |
"target_modules": [
|
|
|
|
|
|
|
|
|
26 |
"q_proj",
|
27 |
+
"down_proj",
|
28 |
+
"v_proj",
|
29 |
+
"k_proj",
|
30 |
+
"gate_proj",
|
31 |
"o_proj",
|
32 |
+
"up_proj"
|
33 |
],
|
34 |
"task_type": "CAUSAL_LM",
|
35 |
"use_dora": false,
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1342238560
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:558f0558dbe2ed2fed185bbe33a32e697578eb37a71364f4ae39a77ac585d1c8
|
3 |
size 1342238560
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 683268498
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:781887d172314801ab8802842158c08145ef998a6a80b07686139a50d9285ded
|
3 |
size 683268498
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e85f0257d01a91ff4050d39219e8dd384bbb4cfdc5b2e0fb4fabf6b2fe3b33e2
|
3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:840aa8e3a2615e43038d3be582aa3892a5d4ec1157dbf18b35d8a9ff2904fee4
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
-
"epoch": 0.
|
5 |
"eval_steps": 500,
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
@@ -11781,6 +11781,237 @@
|
|
11781 |
"learning_rate": 1.9532589419723944e-05,
|
11782 |
"loss": 1.7161,
|
11783 |
"step": 336400
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11784 |
}
|
11785 |
],
|
11786 |
"logging_steps": 200,
|
@@ -11800,7 +12031,7 @@
|
|
11800 |
"attributes": {}
|
11801 |
}
|
11802 |
},
|
11803 |
-
"total_flos": 4.
|
11804 |
"train_batch_size": 1,
|
11805 |
"trial_name": null,
|
11806 |
"trial_params": null
|
|
|
1 |
{
|
2 |
"best_metric": null,
|
3 |
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.23855044785586269,
|
5 |
"eval_steps": 500,
|
6 |
+
"global_step": 343000,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
|
|
11781 |
"learning_rate": 1.9532589419723944e-05,
|
11782 |
"loss": 1.7161,
|
11783 |
"step": 336400
|
11784 |
+
},
|
11785 |
+
{
|
11786 |
+
"epoch": 0.23409936078216728,
|
11787 |
+
"grad_norm": 5.75113582611084,
|
11788 |
+
"learning_rate": 1.9532039753658822e-05,
|
11789 |
+
"loss": 1.6752,
|
11790 |
+
"step": 336600
|
11791 |
+
},
|
11792 |
+
{
|
11793 |
+
"epoch": 0.23423845725322026,
|
11794 |
+
"grad_norm": 3.8082878589630127,
|
11795 |
+
"learning_rate": 1.9531489775761617e-05,
|
11796 |
+
"loss": 1.6679,
|
11797 |
+
"step": 336800
|
11798 |
+
},
|
11799 |
+
{
|
11800 |
+
"epoch": 0.23437755372427324,
|
11801 |
+
"grad_norm": 4.37647819519043,
|
11802 |
+
"learning_rate": 1.953093948605858e-05,
|
11803 |
+
"loss": 1.6643,
|
11804 |
+
"step": 337000
|
11805 |
+
},
|
11806 |
+
{
|
11807 |
+
"epoch": 0.23451665019532622,
|
11808 |
+
"grad_norm": 5.018675327301025,
|
11809 |
+
"learning_rate": 1.953038888457599e-05,
|
11810 |
+
"loss": 1.6606,
|
11811 |
+
"step": 337200
|
11812 |
+
},
|
11813 |
+
{
|
11814 |
+
"epoch": 0.2346557466663792,
|
11815 |
+
"grad_norm": 5.047998905181885,
|
11816 |
+
"learning_rate": 1.952983797134013e-05,
|
11817 |
+
"loss": 1.6508,
|
11818 |
+
"step": 337400
|
11819 |
+
},
|
11820 |
+
{
|
11821 |
+
"epoch": 0.23479484313743218,
|
11822 |
+
"grad_norm": 7.279408931732178,
|
11823 |
+
"learning_rate": 1.95292867463773e-05,
|
11824 |
+
"loss": 1.6547,
|
11825 |
+
"step": 337600
|
11826 |
+
},
|
11827 |
+
{
|
11828 |
+
"epoch": 0.23493393960848516,
|
11829 |
+
"grad_norm": 6.7975382804870605,
|
11830 |
+
"learning_rate": 1.9528735209713808e-05,
|
11831 |
+
"loss": 1.6461,
|
11832 |
+
"step": 337800
|
11833 |
+
},
|
11834 |
+
{
|
11835 |
+
"epoch": 0.23507303607953814,
|
11836 |
+
"grad_norm": 7.198062896728516,
|
11837 |
+
"learning_rate": 1.9528183361375986e-05,
|
11838 |
+
"loss": 1.6954,
|
11839 |
+
"step": 338000
|
11840 |
+
},
|
11841 |
+
{
|
11842 |
+
"epoch": 0.23521213255059112,
|
11843 |
+
"grad_norm": 4.493501663208008,
|
11844 |
+
"learning_rate": 1.9527631201390185e-05,
|
11845 |
+
"loss": 1.6956,
|
11846 |
+
"step": 338200
|
11847 |
+
},
|
11848 |
+
{
|
11849 |
+
"epoch": 0.2353512290216441,
|
11850 |
+
"grad_norm": 4.0898118019104,
|
11851 |
+
"learning_rate": 1.952707872978276e-05,
|
11852 |
+
"loss": 1.6233,
|
11853 |
+
"step": 338400
|
11854 |
+
},
|
11855 |
+
{
|
11856 |
+
"epoch": 0.23549032549269708,
|
11857 |
+
"grad_norm": 3.5022025108337402,
|
11858 |
+
"learning_rate": 1.952652594658009e-05,
|
11859 |
+
"loss": 1.6675,
|
11860 |
+
"step": 338600
|
11861 |
+
},
|
11862 |
+
{
|
11863 |
+
"epoch": 0.23562942196375006,
|
11864 |
+
"grad_norm": 3.9198243618011475,
|
11865 |
+
"learning_rate": 1.9525972851808555e-05,
|
11866 |
+
"loss": 1.6433,
|
11867 |
+
"step": 338800
|
11868 |
+
},
|
11869 |
+
{
|
11870 |
+
"epoch": 0.23576851843480304,
|
11871 |
+
"grad_norm": 4.736083507537842,
|
11872 |
+
"learning_rate": 1.9525419445494563e-05,
|
11873 |
+
"loss": 1.6486,
|
11874 |
+
"step": 339000
|
11875 |
+
},
|
11876 |
+
{
|
11877 |
+
"epoch": 0.23590761490585604,
|
11878 |
+
"grad_norm": 3.913604259490967,
|
11879 |
+
"learning_rate": 1.952486572766454e-05,
|
11880 |
+
"loss": 1.5873,
|
11881 |
+
"step": 339200
|
11882 |
+
},
|
11883 |
+
{
|
11884 |
+
"epoch": 0.23604671137690902,
|
11885 |
+
"grad_norm": 4.593210220336914,
|
11886 |
+
"learning_rate": 1.9524311698344908e-05,
|
11887 |
+
"loss": 1.696,
|
11888 |
+
"step": 339400
|
11889 |
+
},
|
11890 |
+
{
|
11891 |
+
"epoch": 0.236185807847962,
|
11892 |
+
"grad_norm": 12.825864791870117,
|
11893 |
+
"learning_rate": 1.9523757357562124e-05,
|
11894 |
+
"loss": 1.6756,
|
11895 |
+
"step": 339600
|
11896 |
+
},
|
11897 |
+
{
|
11898 |
+
"epoch": 0.23632490431901498,
|
11899 |
+
"grad_norm": 3.4124608039855957,
|
11900 |
+
"learning_rate": 1.9523202705342653e-05,
|
11901 |
+
"loss": 1.6614,
|
11902 |
+
"step": 339800
|
11903 |
+
},
|
11904 |
+
{
|
11905 |
+
"epoch": 0.23646400079006796,
|
11906 |
+
"grad_norm": 3.605181932449341,
|
11907 |
+
"learning_rate": 1.9522647741712966e-05,
|
11908 |
+
"loss": 1.6916,
|
11909 |
+
"step": 340000
|
11910 |
+
},
|
11911 |
+
{
|
11912 |
+
"epoch": 0.23660309726112094,
|
11913 |
+
"grad_norm": 5.278689384460449,
|
11914 |
+
"learning_rate": 1.952209246669956e-05,
|
11915 |
+
"loss": 1.6617,
|
11916 |
+
"step": 340200
|
11917 |
+
},
|
11918 |
+
{
|
11919 |
+
"epoch": 0.23674219373217392,
|
11920 |
+
"grad_norm": 5.578737258911133,
|
11921 |
+
"learning_rate": 1.9521536880328943e-05,
|
11922 |
+
"loss": 1.7077,
|
11923 |
+
"step": 340400
|
11924 |
+
},
|
11925 |
+
{
|
11926 |
+
"epoch": 0.2368812902032269,
|
11927 |
+
"grad_norm": 4.157208442687988,
|
11928 |
+
"learning_rate": 1.9520980982627642e-05,
|
11929 |
+
"loss": 1.6824,
|
11930 |
+
"step": 340600
|
11931 |
+
},
|
11932 |
+
{
|
11933 |
+
"epoch": 0.23702038667427988,
|
11934 |
+
"grad_norm": 3.1329407691955566,
|
11935 |
+
"learning_rate": 1.9520424773622193e-05,
|
11936 |
+
"loss": 1.6559,
|
11937 |
+
"step": 340800
|
11938 |
+
},
|
11939 |
+
{
|
11940 |
+
"epoch": 0.23715948314533286,
|
11941 |
+
"grad_norm": 4.475450038909912,
|
11942 |
+
"learning_rate": 1.951986825333914e-05,
|
11943 |
+
"loss": 1.7017,
|
11944 |
+
"step": 341000
|
11945 |
+
},
|
11946 |
+
{
|
11947 |
+
"epoch": 0.23729857961638584,
|
11948 |
+
"grad_norm": 4.912330627441406,
|
11949 |
+
"learning_rate": 1.9519311421805062e-05,
|
11950 |
+
"loss": 1.6263,
|
11951 |
+
"step": 341200
|
11952 |
+
},
|
11953 |
+
{
|
11954 |
+
"epoch": 0.23743767608743882,
|
11955 |
+
"grad_norm": 6.892397403717041,
|
11956 |
+
"learning_rate": 1.951875427904654e-05,
|
11957 |
+
"loss": 1.7071,
|
11958 |
+
"step": 341400
|
11959 |
+
},
|
11960 |
+
{
|
11961 |
+
"epoch": 0.2375767725584918,
|
11962 |
+
"grad_norm": 4.659296989440918,
|
11963 |
+
"learning_rate": 1.9518196825090167e-05,
|
11964 |
+
"loss": 1.6526,
|
11965 |
+
"step": 341600
|
11966 |
+
},
|
11967 |
+
{
|
11968 |
+
"epoch": 0.23771586902954478,
|
11969 |
+
"grad_norm": 7.2321977615356445,
|
11970 |
+
"learning_rate": 1.9517639059962558e-05,
|
11971 |
+
"loss": 1.619,
|
11972 |
+
"step": 341800
|
11973 |
+
},
|
11974 |
+
{
|
11975 |
+
"epoch": 0.23785496550059776,
|
11976 |
+
"grad_norm": 4.7723283767700195,
|
11977 |
+
"learning_rate": 1.951708098369033e-05,
|
11978 |
+
"loss": 1.6601,
|
11979 |
+
"step": 342000
|
11980 |
+
},
|
11981 |
+
{
|
11982 |
+
"epoch": 0.23799406197165074,
|
11983 |
+
"grad_norm": 4.46943473815918,
|
11984 |
+
"learning_rate": 1.951652259630014e-05,
|
11985 |
+
"loss": 1.6552,
|
11986 |
+
"step": 342200
|
11987 |
+
},
|
11988 |
+
{
|
11989 |
+
"epoch": 0.23813315844270372,
|
11990 |
+
"grad_norm": 3.9207563400268555,
|
11991 |
+
"learning_rate": 1.951596389781864e-05,
|
11992 |
+
"loss": 1.6588,
|
11993 |
+
"step": 342400
|
11994 |
+
},
|
11995 |
+
{
|
11996 |
+
"epoch": 0.2382722549137567,
|
11997 |
+
"grad_norm": 4.317783355712891,
|
11998 |
+
"learning_rate": 1.95154048882725e-05,
|
11999 |
+
"loss": 1.6362,
|
12000 |
+
"step": 342600
|
12001 |
+
},
|
12002 |
+
{
|
12003 |
+
"epoch": 0.2384113513848097,
|
12004 |
+
"grad_norm": 4.8455939292907715,
|
12005 |
+
"learning_rate": 1.9514845567688408e-05,
|
12006 |
+
"loss": 1.6518,
|
12007 |
+
"step": 342800
|
12008 |
+
},
|
12009 |
+
{
|
12010 |
+
"epoch": 0.23855044785586269,
|
12011 |
+
"grad_norm": 7.664321422576904,
|
12012 |
+
"learning_rate": 1.9514285936093064e-05,
|
12013 |
+
"loss": 1.6889,
|
12014 |
+
"step": 343000
|
12015 |
}
|
12016 |
],
|
12017 |
"logging_steps": 200,
|
|
|
12031 |
"attributes": {}
|
12032 |
}
|
12033 |
},
|
12034 |
+
"total_flos": 4.567214300600918e+18,
|
12035 |
"train_batch_size": 1,
|
12036 |
"trial_name": null,
|
12037 |
"trial_params": null
|
last-checkpoint/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 6840
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f194f0afbf00cd135f18b6f6e0dc2d489f2d84487accfafc9254221384d4d16
|
3 |
size 6840
|