Upload folder using huggingface_hub
Browse files- res_pre_attn/Ra0_S-1_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-1_R1_P0_config.json +38 -0
- res_pre_attn/Ra0_S-2_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-2_R1_P0_config.json +38 -0
- res_pre_attn/Ra0_S-3_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-3_R1_P0_config.json +38 -0
- res_pre_attn/Ra0_S-4_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-4_R1_P0_config.json +38 -0
- res_pre_attn/Ra0_S-5_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-5_R1_P0_config.json +38 -0
- res_pre_attn/Ra0_S-6_R1_P0.pt +3 -0
- res_pre_attn/Ra0_S-6_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-1_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-1_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-2_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-2_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-3_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-3_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-4_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-4_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-5_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-5_R1_P0_config.json +38 -0
- res_pre_attn/Ra1_S-6_R1_P0.pt +3 -0
- res_pre_attn/Ra1_S-6_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-1_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-1_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-2_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-2_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-3_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-3_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-4_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-4_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-5_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-5_R1_P0_config.json +38 -0
- res_pre_attn/Ra2_S-6_R1_P0.pt +3 -0
- res_pre_attn/Ra2_S-6_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-1_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-1_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-2_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-2_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-3_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-3_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-4_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-4_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-5_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-5_R1_P0_config.json +38 -0
- res_pre_attn/Ra3_S-6_R1_P0.pt +3 -0
- res_pre_attn/Ra3_S-6_R1_P0_config.json +38 -0
res_pre_attn/Ra0_S-1_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:168696ab72886f38f5d97e585085e3a8ed8f518ecce43f5e8729e66a09ea967c
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-1_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-1_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra0_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:492bca86760bce0fe1d4197a02cefd782badd6286102e5d8dd81f63abdad543d
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-2_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra0_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66aa12e85224b5fbc7327ddbebbe2adc98cd2c3db640a3aba08c880e733f846f
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-3_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra0_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d52b130e663b809bd8949f8b361b32b8a41af046b12d31de85f6be0b62534d8a
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-4_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra0_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49cc2a7e45089eb83bcac6f525517d3704678fb227402f4c1f4a18ecfdc3ad40
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-5_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra0_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed026ea5d3bbe516a03eeb08ba582fc80492c4ef73b529821144f4bf6903fc0e
|
3 |
+
size 153705088
|
res_pre_attn/Ra0_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra0_S-6_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-1_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cb793e5208c28a2f1f95a7b0d9335075b8fd8a86e7a4fd041469131dcc05cd84
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-1_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-1_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f27213da6eba34178d562cff99f0b80634ca5e7244e63dea4aacb15f98513056
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-2_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1753c457d61b78fed9133582c8fb2623e2bd4c216c5f596ac6d89726dcf5c781
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-3_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a2b170964fdb49676e3cbef24b250f093e557511d946c376ac1695d5a102dbd
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-4_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9d4de526c9389dfab48012beab25ab10b9f43508e37d198d1ab200b643027db
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-5_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra1_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f27664bd41b9a2f585b3f7e54b4bc1f36a1364848dfc1cf1ed934c0bbe253cf7
|
3 |
+
size 153705088
|
res_pre_attn/Ra1_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra1_S-6_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-1_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b17d5b60a89465a4f754b2047143596e1c4b43a7b09071172f6c1178c637e952
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-1_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-1_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:71f7d678ad0283dac56b3e51c3496968ff25bd1f2ebebaf56b922f856079709e
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-2_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:022b8b0b892cffab5a929a0b9c6b5a65dce8655d0a2e5264510637d82f75f441
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-3_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eed0e84ef30a48e85c68b57423aa2c09f5d6ce812f47fea3933d8146522a9b70
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-4_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24b9e4a1d45288357580a726fff2307798020dd9796ad4ba6ac7bcf9e8eeb0c4
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-5_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra2_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7c73d8a8bd6c1aaa84ea548081ad934c63918b9e8573df353fa0d8289279a47a
|
3 |
+
size 153705088
|
res_pre_attn/Ra2_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra2_S-6_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-1_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c7a29cd8f711cbcac6078d1887dd32b2d98562cde08befaddfdc0d60db9be391
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-1_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -1,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-1_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1bd39d50e0f200223c4b62f63ca37ab555ba377a34e196584b3098dc46f7e0c
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-2_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68baad0c607802ada81a30324ba3a4013e5eef8a1f1ca7b08121d20deaba0b91
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-3_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:00382e4a254b88fcd9ca43bffea80815d2137d437ddbf3a3ea1eeb96782e39c0
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-4_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90bdacfe5c96776c7a0d41ed61ab386000909df6220365ff3de552c1e40e5235
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-5_R1_P0"
|
38 |
+
}
|
res_pre_attn/Ra3_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:39dc91fef1df76ae356b6680c38a7c4cfe3898b1580f62773399dff8d94e3215
|
3 |
+
size 153705088
|
res_pre_attn/Ra3_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "x",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "res_pre_attn",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "res_A",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "Ra3_S-6_R1_P0"
|
38 |
+
}
|