Training in progress, step 1900, checkpoint
last-checkpoint/adapter_config.json CHANGED
```diff
@@ -26,13 +26,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "gate_proj",
-    "down_proj",
-    "q_proj",
     "v_proj",
+    "up_proj",
+    "q_proj",
     "o_proj",
-    "up_proj"
+    "k_proj",
+    "gate_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
```
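For context: `target_modules` in a PEFT `adapter_config.json` names the layers that receive LoRA adapters. The change here only reorders the list; the same seven projection modules appear on both sides. Below is a minimal sketch of how such a config is expressed with the `peft` library; the `r` and `lora_alpha` values are placeholders, since the diff does not show them.

```python
# Hypothetical reconstruction of this adapter's LoRA configuration.
# Only target_modules, task_type, and use_dora are taken from the diff;
# r and lora_alpha are assumed placeholder values.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,            # assumed rank, not visible in the diff
    lora_alpha=32,   # assumed scaling factor, not visible in the diff
    target_modules=[
        "v_proj", "up_proj", "q_proj", "o_proj",
        "k_proj", "gate_proj", "down_proj",
    ],
    task_type="CAUSAL_LM",
    use_dora=False,
)
```

Since `target_modules` is effectively a set, a reordering like this typically comes from re-serializing the config and has no effect on which layers are adapted.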
last-checkpoint/adapter_model.safetensors CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0f76dbb9a45ae718cb4c4ffa542564fbd46a97583f94b15b7d3e80c39275a70f
 size 1370666272
```
last-checkpoint/optimizer.pt CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0fb3c2e24267fa356ea44dc14e7953e417fa1d6dd44f526c4daea1bcf6b647b7
 size 697294462
```
last-checkpoint/scheduler.pt CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7e3dad5e9640794d19b0f41e34b58f722c69f08c60cfeb247e583e12e03c10e0
 size 1064
```
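The three binary files above are stored with Git LFS, so the repository tracks only a small pointer per blob: a spec version, the SHA-256 of the content (`oid`), and the byte size. Each checkpoint save rewrites the `oid` while the sizes stay the same here. A small sketch, not part of the repo, for verifying a downloaded file against its pointer:

```python
# Check a downloaded checkpoint file against its Git LFS pointer:
# the pointer records only the content's SHA-256 and byte size,
# so both can be recomputed locally. Paths are illustrative.
import hashlib

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

# Example with the scheduler pointer from the diff above:
print(verify_lfs_pointer(
    "last-checkpoint/scheduler.pt",
    "7e3dad5e9640794d19b0f41e34b58f722c69f08c60cfeb247e583e12e03c10e0",
    1064,
))
```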
last-checkpoint/trainer_state.json CHANGED
```diff
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.4409887431820819,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 1900,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -12957,6 +12957,356 @@
       "learning_rate": 0.00019639739459366182,
       "loss": 0.9533,
       "step": 1850
+    },
+    {
+      "epoch": 0.42961587559475456,
+      "grad_norm": 0.47029098868370056,
+      "learning_rate": 0.0001963935140270619,
+      "loss": 0.8904,
+      "step": 1851
+    },
+    {
+      "epoch": 0.4298479749332714,
+      "grad_norm": 0.5885578393936157,
+      "learning_rate": 0.00019638963140997906,
+      "loss": 0.8957,
+      "step": 1852
+    },
+    {
+      "epoch": 0.43008007427178835,
+      "grad_norm": 0.5930177569389343,
+      "learning_rate": 0.00019638574674249587,
+      "loss": 1.0084,
+      "step": 1853
+    },
+    {
+      "epoch": 0.4303121736103052,
+      "grad_norm": 0.5558833479881287,
+      "learning_rate": 0.00019638186002469494,
+      "loss": 0.9102,
+      "step": 1854
+    },
+    {
+      "epoch": 0.4305442729488221,
+      "grad_norm": 0.5855537056922913,
+      "learning_rate": 0.000196377971256659,
+      "loss": 0.9468,
+      "step": 1855
+    },
+    {
+      "epoch": 0.430776372287339,
+      "grad_norm": 0.5754596590995789,
+      "learning_rate": 0.00019637408043847074,
+      "loss": 0.9044,
+      "step": 1856
+    },
+    {
+      "epoch": 0.43100847162585587,
+      "grad_norm": 0.584676206111908,
+      "learning_rate": 0.00019637018757021296,
+      "loss": 0.8508,
+      "step": 1857
+    },
+    {
+      "epoch": 0.43124057096437274,
+      "grad_norm": 0.49439355731010437,
+      "learning_rate": 0.0001963662926519684,
+      "loss": 0.8681,
+      "step": 1858
+    },
+    {
+      "epoch": 0.43147267030288966,
+      "grad_norm": 0.5786611437797546,
+      "learning_rate": 0.00019636239568382,
+      "loss": 0.9083,
+      "step": 1859
+    },
+    {
+      "epoch": 0.4317047696414065,
+      "grad_norm": 0.5431936383247375,
+      "learning_rate": 0.00019635849666585058,
+      "loss": 0.9406,
+      "step": 1860
+    },
+    {
+      "epoch": 0.4319368689799234,
+      "grad_norm": 0.6521342992782593,
+      "learning_rate": 0.00019635459559814314,
+      "loss": 0.8659,
+      "step": 1861
+    },
+    {
+      "epoch": 0.4321689683184403,
+      "grad_norm": 0.5077570676803589,
+      "learning_rate": 0.00019635069248078062,
+      "loss": 0.9172,
+      "step": 1862
+    },
+    {
+      "epoch": 0.4324010676569572,
+      "grad_norm": 0.5636994242668152,
+      "learning_rate": 0.00019634678731384608,
+      "loss": 0.9587,
+      "step": 1863
+    },
+    {
+      "epoch": 0.43263316699547405,
+      "grad_norm": 0.48513078689575195,
+      "learning_rate": 0.00019634288009742255,
+      "loss": 0.9519,
+      "step": 1864
+    },
+    {
+      "epoch": 0.432865266333991,
+      "grad_norm": 0.519437849521637,
+      "learning_rate": 0.00019633897083159318,
+      "loss": 0.9289,
+      "step": 1865
+    },
+    {
+      "epoch": 0.43309736567250784,
+      "grad_norm": 0.5995944738388062,
+      "learning_rate": 0.00019633505951644113,
+      "loss": 0.9566,
+      "step": 1866
+    },
+    {
+      "epoch": 0.4333294650110247,
+      "grad_norm": 0.5057395100593567,
+      "learning_rate": 0.00019633114615204958,
+      "loss": 0.9654,
+      "step": 1867
+    },
+    {
+      "epoch": 0.43356156434954163,
+      "grad_norm": 0.5791558623313904,
+      "learning_rate": 0.00019632723073850176,
+      "loss": 0.9469,
+      "step": 1868
+    },
+    {
+      "epoch": 0.4337936636880585,
+      "grad_norm": 0.5840992331504822,
+      "learning_rate": 0.000196323313275881,
+      "loss": 0.918,
+      "step": 1869
+    },
+    {
+      "epoch": 0.43402576302657536,
+      "grad_norm": 0.550893247127533,
+      "learning_rate": 0.00019631939376427062,
+      "loss": 0.8612,
+      "step": 1870
+    },
+    {
+      "epoch": 0.4342578623650923,
+      "grad_norm": 0.537064790725708,
+      "learning_rate": 0.00019631547220375398,
+      "loss": 0.9316,
+      "step": 1871
+    },
+    {
+      "epoch": 0.43448996170360915,
+      "grad_norm": 0.5622636675834656,
+      "learning_rate": 0.00019631154859441454,
+      "loss": 0.8822,
+      "step": 1872
+    },
+    {
+      "epoch": 0.434722061042126,
+      "grad_norm": 0.599727213382721,
+      "learning_rate": 0.0001963076229363357,
+      "loss": 0.956,
+      "step": 1873
+    },
+    {
+      "epoch": 0.43495416038064294,
+      "grad_norm": 0.5084268450737,
+      "learning_rate": 0.00019630369522960104,
+      "loss": 0.8993,
+      "step": 1874
+    },
+    {
+      "epoch": 0.4351862597191598,
+      "grad_norm": 0.547834038734436,
+      "learning_rate": 0.00019629976547429402,
+      "loss": 0.9046,
+      "step": 1875
+    },
+    {
+      "epoch": 0.4354183590576767,
+      "grad_norm": 0.5189753770828247,
+      "learning_rate": 0.0001962958336704983,
+      "loss": 0.8458,
+      "step": 1876
+    },
+    {
+      "epoch": 0.4356504583961936,
+      "grad_norm": 0.501224160194397,
+      "learning_rate": 0.00019629189981829753,
+      "loss": 0.905,
+      "step": 1877
+    },
+    {
+      "epoch": 0.43588255773471046,
+      "grad_norm": 0.5444706082344055,
+      "learning_rate": 0.0001962879639177753,
+      "loss": 0.8975,
+      "step": 1878
+    },
+    {
+      "epoch": 0.43611465707322733,
+      "grad_norm": 0.5328624248504639,
+      "learning_rate": 0.00019628402596901545,
+      "loss": 0.9257,
+      "step": 1879
+    },
+    {
+      "epoch": 0.43634675641174425,
+      "grad_norm": 0.5254698991775513,
+      "learning_rate": 0.00019628008597210168,
+      "loss": 0.8739,
+      "step": 1880
+    },
+    {
+      "epoch": 0.4365788557502611,
+      "grad_norm": 0.5245271921157837,
+      "learning_rate": 0.0001962761439271178,
+      "loss": 0.8952,
+      "step": 1881
+    },
+    {
+      "epoch": 0.436810955088778,
+      "grad_norm": 0.5154178142547607,
+      "learning_rate": 0.00019627219983414768,
+      "loss": 0.9408,
+      "step": 1882
+    },
+    {
+      "epoch": 0.4370430544272949,
+      "grad_norm": 0.5660544037818909,
+      "learning_rate": 0.00019626825369327525,
+      "loss": 0.8846,
+      "step": 1883
+    },
+    {
+      "epoch": 0.4372751537658118,
+      "grad_norm": 0.5544506907463074,
+      "learning_rate": 0.0001962643055045844,
+      "loss": 0.9322,
+      "step": 1884
+    },
+    {
+      "epoch": 0.43750725310432864,
+      "grad_norm": 0.49590614438056946,
+      "learning_rate": 0.00019626035526815912,
+      "loss": 0.9737,
+      "step": 1885
+    },
+    {
+      "epoch": 0.43773935244284556,
+      "grad_norm": 0.5184259414672852,
+      "learning_rate": 0.0001962564029840835,
+      "loss": 0.9169,
+      "step": 1886
+    },
+    {
+      "epoch": 0.43797145178136243,
+      "grad_norm": 0.5171828866004944,
+      "learning_rate": 0.00019625244865244156,
+      "loss": 0.8724,
+      "step": 1887
+    },
+    {
+      "epoch": 0.4382035511198793,
+      "grad_norm": 0.606625497341156,
+      "learning_rate": 0.0001962484922733174,
+      "loss": 0.8666,
+      "step": 1888
+    },
+    {
+      "epoch": 0.4384356504583962,
+      "grad_norm": 0.5377411842346191,
+      "learning_rate": 0.0001962445338467952,
+      "loss": 0.9142,
+      "step": 1889
+    },
+    {
+      "epoch": 0.4386677497969131,
+      "grad_norm": 0.5942894220352173,
+      "learning_rate": 0.00019624057337295922,
+      "loss": 0.957,
+      "step": 1890
+    },
+    {
+      "epoch": 0.43889984913542995,
+      "grad_norm": 0.5858636498451233,
+      "learning_rate": 0.00019623661085189364,
+      "loss": 0.9022,
+      "step": 1891
+    },
+    {
+      "epoch": 0.4391319484739469,
+      "grad_norm": 0.5353084206581116,
+      "learning_rate": 0.00019623264628368275,
+      "loss": 0.8723,
+      "step": 1892
+    },
+    {
+      "epoch": 0.43936404781246374,
+      "grad_norm": 0.5895339846611023,
+      "learning_rate": 0.0001962286796684109,
+      "loss": 0.9509,
+      "step": 1893
+    },
+    {
+      "epoch": 0.4395961471509806,
+      "grad_norm": 0.5124474763870239,
+      "learning_rate": 0.0001962247110061625,
+      "loss": 0.9523,
+      "step": 1894
+    },
+    {
+      "epoch": 0.43982824648949753,
+      "grad_norm": 0.53212571144104,
+      "learning_rate": 0.00019622074029702194,
+      "loss": 0.8931,
+      "step": 1895
+    },
+    {
+      "epoch": 0.4400603458280144,
+      "grad_norm": 0.4760664999485016,
+      "learning_rate": 0.00019621676754107367,
+      "loss": 0.9609,
+      "step": 1896
+    },
+    {
+      "epoch": 0.44029244516653127,
+      "grad_norm": 0.4855426549911499,
+      "learning_rate": 0.0001962127927384022,
+      "loss": 0.9561,
+      "step": 1897
+    },
+    {
+      "epoch": 0.4405245445050482,
+      "grad_norm": 0.6112794876098633,
+      "learning_rate": 0.00019620881588909212,
+      "loss": 0.9166,
+      "step": 1898
+    },
+    {
+      "epoch": 0.44075664384356505,
+      "grad_norm": 0.5399686098098755,
+      "learning_rate": 0.00019620483699322802,
+      "loss": 0.8998,
+      "step": 1899
+    },
+    {
+      "epoch": 0.4409887431820819,
+      "grad_norm": 0.5019717216491699,
+      "learning_rate": 0.00019620085605089448,
+      "loss": 0.8652,
+      "step": 1900
     }
   ],
   "logging_steps": 1,
@@ -12976,7 +13326,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 8.
+  "total_flos": 8.433657609977856e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
```
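`trainer_state.json` carries the Trainer's running log: this commit advances `global_step` from 1850 to 1900 (epoch ≈ 0.441 of the first pass over the data) and appends one `log_history` entry per step, since `logging_steps` is 1. A short sketch for inspecting that log after downloading the checkpoint; it assumes only the layout visible in the diff above:

```python
# Summarize the training log stored in the checkpoint's trainer_state.json.
# Assumes the structure shown above: top-level "global_step"/"epoch" and a
# "log_history" list of per-step dicts with "loss", "learning_rate", etc.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(f"global_step={state['global_step']}, epoch={state['epoch']:.4f}")

# Mean loss over the 50 entries added in this commit (steps 1851-1900).
recent = [e for e in state["log_history"][-50:] if "loss" in e]
mean_loss = sum(e["loss"] for e in recent) / len(recent)
print(f"mean loss over last {len(recent)} logged steps: {mean_loss:.4f}")
```

Over these 50 steps the loss hovers around 0.9 while the learning rate decays only slightly (1.96394e-4 → 1.96201e-4), consistent with being early in a long schedule.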