Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- small_attn_out/A0_N100_S-1.pt +3 -0
- small_attn_out/A0_N100_S-10.pt +3 -0
- small_attn_out/A0_N100_S-10_config.json +38 -0
- small_attn_out/A0_N100_S-1_config.json +38 -0
- small_attn_out/A0_N100_S-2.pt +3 -0
- small_attn_out/A0_N100_S-2_config.json +38 -0
- small_attn_out/A0_N100_S-3.pt +3 -0
- small_attn_out/A0_N100_S-3_config.json +38 -0
- small_attn_out/A0_N100_S-4.pt +3 -0
- small_attn_out/A0_N100_S-4_config.json +38 -0
- small_attn_out/A0_N100_S-5.pt +3 -0
- small_attn_out/A0_N100_S-5_config.json +38 -0
- small_attn_out/A0_N100_S-6.pt +3 -0
- small_attn_out/A0_N100_S-6_config.json +38 -0
- small_attn_out/A0_N100_S-7.pt +3 -0
- small_attn_out/A0_N100_S-7_config.json +38 -0
- small_attn_out/A0_N100_S-8.pt +3 -0
- small_attn_out/A0_N100_S-8_config.json +38 -0
- small_attn_out/A0_N100_S-9.pt +3 -0
- small_attn_out/A0_N100_S-9_config.json +38 -0
- small_attn_out/A0_N100_S0.pt +3 -0
- small_attn_out/A0_N100_S0_config.json +38 -0
- small_attn_out/A0_N100_S1.pt +3 -0
- small_attn_out/A0_N100_S1_config.json +38 -0
- small_attn_out/A0_N100_S2.pt +3 -0
- small_attn_out/A0_N100_S2_config.json +38 -0
- small_attn_out/A0_N300_S-1.pt +3 -0
- small_attn_out/A0_N300_S-10.pt +3 -0
- small_attn_out/A0_N300_S-10_config.json +38 -0
- small_attn_out/A0_N300_S-1_config.json +38 -0
- small_attn_out/A0_N300_S-2.pt +3 -0
- small_attn_out/A0_N300_S-2_config.json +38 -0
- small_attn_out/A0_N300_S-3.pt +3 -0
- small_attn_out/A0_N300_S-3_config.json +38 -0
- small_attn_out/A0_N300_S-4.pt +3 -0
- small_attn_out/A0_N300_S-4_config.json +38 -0
- small_attn_out/A0_N300_S-5.pt +3 -0
- small_attn_out/A0_N300_S-5_config.json +38 -0
- small_attn_out/A0_N300_S-6.pt +3 -0
- small_attn_out/A0_N300_S-6_config.json +38 -0
- small_attn_out/A0_N300_S-7.pt +3 -0
- small_attn_out/A0_N300_S-7_config.json +38 -0
- small_attn_out/A0_N300_S-8.pt +3 -0
- small_attn_out/A0_N300_S-8_config.json +38 -0
- small_attn_out/A0_N300_S-9.pt +3 -0
- small_attn_out/A0_N300_S-9_config.json +38 -0
- small_attn_out/A0_N300_S0.pt +3 -0
- small_attn_out/A0_N300_S0_config.json +38 -0
- small_attn_out/A0_N300_S1.pt +3 -0
- small_attn_out/A0_N300_S1_config.json +38 -0
small_attn_out/A0_N100_S-1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb86b244378b3011dfc004d1058b6115d991aa9955b699fd455d4b8bd560c6f5
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-10.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9abeff8e1ac701f13b70f199d73896437df864eb5641b6b61001cf97ece40c8c
|
3 |
+
size 619896
|
small_attn_out/A0_N100_S-10_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -10,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-10"
|
38 |
+
}
|
small_attn_out/A0_N100_S-1_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-1"
|
38 |
+
}
|
small_attn_out/A0_N100_S-2.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:57f00dfe46558ea82dc20bb8d8af5adb13e33404143df202ace91637481a2e02
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-2_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-2"
|
38 |
+
}
|
small_attn_out/A0_N100_S-3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cdfb7e2f79e79ff1a56ae5d3b9ef0925b793cc8bb73c94c5cb1198c109160a87
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-3_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-3"
|
38 |
+
}
|
small_attn_out/A0_N100_S-4.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64172016801e954ba2bff8539a99c6022cfdfb9fe3cd208563a0f3c379fdd47c
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-4_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-4"
|
38 |
+
}
|
small_attn_out/A0_N100_S-5.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:572e1822d0aaa37dad7d069960134b523b82ce07066395eb9f210053ced8539b
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-5_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-5"
|
38 |
+
}
|
small_attn_out/A0_N100_S-6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b78f5980b6982986dc61df07870239c00001b0767322b6467d13e826335bcce1
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-6_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-6"
|
38 |
+
}
|
small_attn_out/A0_N100_S-7.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28bb163f29b2ba110d5b35ea4c17d5638148b361d0bafa1e7f337fb8444015fe
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-7_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -7,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-7"
|
38 |
+
}
|
small_attn_out/A0_N100_S-8.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:984aa7811931448bcf982f409cae513a98d98d7d6c7d90a1088d946aee8627ee
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-8_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -8,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-8"
|
38 |
+
}
|
small_attn_out/A0_N100_S-9.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a03f1994c850df139158d9725ea08d69d5671fceb4f8d18472865236307c26d
|
3 |
+
size 619888
|
small_attn_out/A0_N100_S-9_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -9,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S-9"
|
38 |
+
}
|
small_attn_out/A0_N100_S0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:538d5363bfe0940cdf3a83928c27ad3d3aad4715d05a2a4832f00ddba40f31d8
|
3 |
+
size 619880
|
small_attn_out/A0_N100_S0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": 0,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S0"
|
38 |
+
}
|
small_attn_out/A0_N100_S1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4eb074a9c3ba59325eb7c86710b1b609ee648ec50ab0f76e447438b5a0782920
|
3 |
+
size 619880
|
small_attn_out/A0_N100_S1_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": 1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S1"
|
38 |
+
}
|
small_attn_out/A0_N100_S2.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:017b709d1416d68bd1d7b3930b35d311d031e08aa6742c29a6c136ffef8c609b
|
3 |
+
size 619880
|
small_attn_out/A0_N100_S2_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 100,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": 2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N100_S2"
|
38 |
+
}
|
small_attn_out/A0_N300_S-1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c7ffcaa369e5d113f5f5549675f412659f653d2c576a3fe6fc04b08b2199f32
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-10.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81488224431b9b80530e275eceb1358a9bb09c1e7912663c8d6efab343e48424
|
3 |
+
size 1849464
|
small_attn_out/A0_N300_S-10_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -10,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-10"
|
38 |
+
}
|
small_attn_out/A0_N300_S-1_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-1"
|
38 |
+
}
|
small_attn_out/A0_N300_S-2.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e5d3a95eef0b490b3697ec4dd635c38ecb34f84f2c5318cbae0388b5a57f9596
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-2_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-2"
|
38 |
+
}
|
small_attn_out/A0_N300_S-3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8da48169455324c25813b76bc11ab00c1adf63ec56f1851229f1897aad23c567
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-3_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-3"
|
38 |
+
}
|
small_attn_out/A0_N300_S-4.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4cf9f5d063a769b24dcc08094808aebd8aaa11835e1f61206a34942699d0096
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-4_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-4"
|
38 |
+
}
|
small_attn_out/A0_N300_S-5.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0fd976f75c285726ce4dd81340dc4ac6c2f788010dd7150ddeb3c3295d1351a8
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-5_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-5"
|
38 |
+
}
|
small_attn_out/A0_N300_S-6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8df0e2aff051466d445625391a8576d382acceaeac2e53cc3d47b3ece9aa1a72
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-6_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-6"
|
38 |
+
}
|
small_attn_out/A0_N300_S-7.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79f20e3fdc82ac8edc83be6cad1899d93c11ae343006694b40148d1720e99e84
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-7_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -7,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-7"
|
38 |
+
}
|
small_attn_out/A0_N300_S-8.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa647d1a405e429b0bb6b7b1be62bd5d776b81f886d767d863f86c2ab2cde947
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-8_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -8,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-8"
|
38 |
+
}
|
small_attn_out/A0_N300_S-9.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c3fba6e9acb66bc3fd73d161cbfccc019653e15a69e390b7e12c7bc76ba438e
|
3 |
+
size 1849456
|
small_attn_out/A0_N300_S-9_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -9,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S-9"
|
38 |
+
}
|
small_attn_out/A0_N300_S0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ab2e792aa009c941252a30ecbc7055e57f60c655df96732dbbfe201a1fa04252
|
3 |
+
size 1849448
|
small_attn_out/A0_N300_S0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": 0,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S0"
|
38 |
+
}
|
small_attn_out/A0_N300_S1.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:64caadf4c6f75349aaa9284817069238d2c9974a337e99c221c394eab3b5b0e8
|
3 |
+
size 1849448
|
small_attn_out/A0_N300_S1_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 300,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": 1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_N{n_features}_S{l1_exp}",
|
27 |
+
"project_name": "small_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_N300_S1"
|
38 |
+
}
|