noanabeshima commited on
Commit
d41c1ab
·
verified ·
1 Parent(s): 78c7da6

Upload folder using huggingface_hub

Browse files
Files changed (48) hide show
  1. res_pre_attn/Ra0_S-1_R1_P0.pt +3 -0
  2. res_pre_attn/Ra0_S-1_R1_P0_config.json +38 -0
  3. res_pre_attn/Ra0_S-2_R1_P0.pt +3 -0
  4. res_pre_attn/Ra0_S-2_R1_P0_config.json +38 -0
  5. res_pre_attn/Ra0_S-3_R1_P0.pt +3 -0
  6. res_pre_attn/Ra0_S-3_R1_P0_config.json +38 -0
  7. res_pre_attn/Ra0_S-4_R1_P0.pt +3 -0
  8. res_pre_attn/Ra0_S-4_R1_P0_config.json +38 -0
  9. res_pre_attn/Ra0_S-5_R1_P0.pt +3 -0
  10. res_pre_attn/Ra0_S-5_R1_P0_config.json +38 -0
  11. res_pre_attn/Ra0_S-6_R1_P0.pt +3 -0
  12. res_pre_attn/Ra0_S-6_R1_P0_config.json +38 -0
  13. res_pre_attn/Ra1_S-1_R1_P0.pt +3 -0
  14. res_pre_attn/Ra1_S-1_R1_P0_config.json +38 -0
  15. res_pre_attn/Ra1_S-2_R1_P0.pt +3 -0
  16. res_pre_attn/Ra1_S-2_R1_P0_config.json +38 -0
  17. res_pre_attn/Ra1_S-3_R1_P0.pt +3 -0
  18. res_pre_attn/Ra1_S-3_R1_P0_config.json +38 -0
  19. res_pre_attn/Ra1_S-4_R1_P0.pt +3 -0
  20. res_pre_attn/Ra1_S-4_R1_P0_config.json +38 -0
  21. res_pre_attn/Ra1_S-5_R1_P0.pt +3 -0
  22. res_pre_attn/Ra1_S-5_R1_P0_config.json +38 -0
  23. res_pre_attn/Ra1_S-6_R1_P0.pt +3 -0
  24. res_pre_attn/Ra1_S-6_R1_P0_config.json +38 -0
  25. res_pre_attn/Ra2_S-1_R1_P0.pt +3 -0
  26. res_pre_attn/Ra2_S-1_R1_P0_config.json +38 -0
  27. res_pre_attn/Ra2_S-2_R1_P0.pt +3 -0
  28. res_pre_attn/Ra2_S-2_R1_P0_config.json +38 -0
  29. res_pre_attn/Ra2_S-3_R1_P0.pt +3 -0
  30. res_pre_attn/Ra2_S-3_R1_P0_config.json +38 -0
  31. res_pre_attn/Ra2_S-4_R1_P0.pt +3 -0
  32. res_pre_attn/Ra2_S-4_R1_P0_config.json +38 -0
  33. res_pre_attn/Ra2_S-5_R1_P0.pt +3 -0
  34. res_pre_attn/Ra2_S-5_R1_P0_config.json +38 -0
  35. res_pre_attn/Ra2_S-6_R1_P0.pt +3 -0
  36. res_pre_attn/Ra2_S-6_R1_P0_config.json +38 -0
  37. res_pre_attn/Ra3_S-1_R1_P0.pt +3 -0
  38. res_pre_attn/Ra3_S-1_R1_P0_config.json +38 -0
  39. res_pre_attn/Ra3_S-2_R1_P0.pt +3 -0
  40. res_pre_attn/Ra3_S-2_R1_P0_config.json +38 -0
  41. res_pre_attn/Ra3_S-3_R1_P0.pt +3 -0
  42. res_pre_attn/Ra3_S-3_R1_P0_config.json +38 -0
  43. res_pre_attn/Ra3_S-4_R1_P0.pt +3 -0
  44. res_pre_attn/Ra3_S-4_R1_P0_config.json +38 -0
  45. res_pre_attn/Ra3_S-5_R1_P0.pt +3 -0
  46. res_pre_attn/Ra3_S-5_R1_P0_config.json +38 -0
  47. res_pre_attn/Ra3_S-6_R1_P0.pt +3 -0
  48. res_pre_attn/Ra3_S-6_R1_P0_config.json +38 -0
res_pre_attn/Ra0_S-1_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:168696ab72886f38f5d97e585085e3a8ed8f518ecce43f5e8729e66a09ea967c
3
+ size 153705088
res_pre_attn/Ra0_S-1_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -1,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-1_R1_P0"
38
+ }
res_pre_attn/Ra0_S-2_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:492bca86760bce0fe1d4197a02cefd782badd6286102e5d8dd81f63abdad543d
3
+ size 153705088
res_pre_attn/Ra0_S-2_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -2,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-2_R1_P0"
38
+ }
res_pre_attn/Ra0_S-3_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66aa12e85224b5fbc7327ddbebbe2adc98cd2c3db640a3aba08c880e733f846f
3
+ size 153705088
res_pre_attn/Ra0_S-3_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -3,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-3_R1_P0"
38
+ }
res_pre_attn/Ra0_S-4_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d52b130e663b809bd8949f8b361b32b8a41af046b12d31de85f6be0b62534d8a
3
+ size 153705088
res_pre_attn/Ra0_S-4_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -4,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-4_R1_P0"
38
+ }
res_pre_attn/Ra0_S-5_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49cc2a7e45089eb83bcac6f525517d3704678fb227402f4c1f4a18ecfdc3ad40
3
+ size 153705088
res_pre_attn/Ra0_S-5_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -5,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-5_R1_P0"
38
+ }
res_pre_attn/Ra0_S-6_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed026ea5d3bbe516a03eeb08ba582fc80492c4ef73b529821144f4bf6903fc0e
3
+ size 153705088
res_pre_attn/Ra0_S-6_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 0,
20
+ "l1_exp": -6,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra0_S-6_R1_P0"
38
+ }
res_pre_attn/Ra1_S-1_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb793e5208c28a2f1f95a7b0d9335075b8fd8a86e7a4fd041469131dcc05cd84
3
+ size 153705088
res_pre_attn/Ra1_S-1_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -1,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-1_R1_P0"
38
+ }
res_pre_attn/Ra1_S-2_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f27213da6eba34178d562cff99f0b80634ca5e7244e63dea4aacb15f98513056
3
+ size 153705088
res_pre_attn/Ra1_S-2_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -2,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-2_R1_P0"
38
+ }
res_pre_attn/Ra1_S-3_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1753c457d61b78fed9133582c8fb2623e2bd4c216c5f596ac6d89726dcf5c781
3
+ size 153705088
res_pre_attn/Ra1_S-3_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -3,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-3_R1_P0"
38
+ }
res_pre_attn/Ra1_S-4_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a2b170964fdb49676e3cbef24b250f093e557511d946c376ac1695d5a102dbd
3
+ size 153705088
res_pre_attn/Ra1_S-4_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -4,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-4_R1_P0"
38
+ }
res_pre_attn/Ra1_S-5_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9d4de526c9389dfab48012beab25ab10b9f43508e37d198d1ab200b643027db
3
+ size 153705088
res_pre_attn/Ra1_S-5_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -5,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-5_R1_P0"
38
+ }
res_pre_attn/Ra1_S-6_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f27664bd41b9a2f585b3f7e54b4bc1f36a1364848dfc1cf1ed934c0bbe253cf7
3
+ size 153705088
res_pre_attn/Ra1_S-6_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 1,
20
+ "l1_exp": -6,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra1_S-6_R1_P0"
38
+ }
res_pre_attn/Ra2_S-1_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b17d5b60a89465a4f754b2047143596e1c4b43a7b09071172f6c1178c637e952
3
+ size 153705088
res_pre_attn/Ra2_S-1_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -1,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-1_R1_P0"
38
+ }
res_pre_attn/Ra2_S-2_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71f7d678ad0283dac56b3e51c3496968ff25bd1f2ebebaf56b922f856079709e
3
+ size 153705088
res_pre_attn/Ra2_S-2_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -2,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-2_R1_P0"
38
+ }
res_pre_attn/Ra2_S-3_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:022b8b0b892cffab5a929a0b9c6b5a65dce8655d0a2e5264510637d82f75f441
3
+ size 153705088
res_pre_attn/Ra2_S-3_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -3,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-3_R1_P0"
38
+ }
res_pre_attn/Ra2_S-4_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed0e84ef30a48e85c68b57423aa2c09f5d6ce812f47fea3933d8146522a9b70
3
+ size 153705088
res_pre_attn/Ra2_S-4_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -4,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-4_R1_P0"
38
+ }
res_pre_attn/Ra2_S-5_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b9e4a1d45288357580a726fff2307798020dd9796ad4ba6ac7bcf9e8eeb0c4
3
+ size 153705088
res_pre_attn/Ra2_S-5_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -5,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-5_R1_P0"
38
+ }
res_pre_attn/Ra2_S-6_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c73d8a8bd6c1aaa84ea548081ad934c63918b9e8573df353fa0d8289279a47a
3
+ size 153705088
res_pre_attn/Ra2_S-6_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 2,
20
+ "l1_exp": -6,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra2_S-6_R1_P0"
38
+ }
res_pre_attn/Ra3_S-1_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7a29cd8f711cbcac6078d1887dd32b2d98562cde08befaddfdc0d60db9be391
3
+ size 153705088
res_pre_attn/Ra3_S-1_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -1,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-1_R1_P0"
38
+ }
res_pre_attn/Ra3_S-2_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1bd39d50e0f200223c4b62f63ca37ab555ba377a34e196584b3098dc46f7e0c
3
+ size 153705088
res_pre_attn/Ra3_S-2_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -2,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-2_R1_P0"
38
+ }
res_pre_attn/Ra3_S-3_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68baad0c607802ada81a30324ba3a4013e5eef8a1f1ca7b08121d20deaba0b91
3
+ size 153705088
res_pre_attn/Ra3_S-3_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -3,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-3_R1_P0"
38
+ }
res_pre_attn/Ra3_S-4_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00382e4a254b88fcd9ca43bffea80815d2137d437ddbf3a3ea1eeb96782e39c0
3
+ size 153705088
res_pre_attn/Ra3_S-4_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -4,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-4_R1_P0"
38
+ }
res_pre_attn/Ra3_S-5_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90bdacfe5c96776c7a0d41ed61ab386000909df6220365ff3de552c1e40e5235
3
+ size 153705088
res_pre_attn/Ra3_S-5_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -5,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-5_R1_P0"
38
+ }
res_pre_attn/Ra3_S-6_R1_P0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39dc91fef1df76ae356b6680c38a7c4cfe3898b1580f62773399dff8d94e3215
3
+ size 153705088
res_pre_attn/Ra3_S-6_R1_P0_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_features": 25000,
3
+ "d_model": 768,
4
+ "lr_exp": -10,
5
+ "disable_comet": false,
6
+ "per_neuron_reinit_interval": 0,
7
+ "reservoir_time_discount": 0.995,
8
+ "reinit_interval": 800,
9
+ "max_reinit_neurons": 5000,
10
+ "reservoir_size": 5000,
11
+ "n_piles": 292,
12
+ "log_interval": 200,
13
+ "reinit_input_norm": "target_scaled",
14
+ "reinit_input": "x",
15
+ "reinit_norm_alpha": 0.3,
16
+ "data_loc": "attn_data",
17
+ "reinit_threshold": -6,
18
+ "scheduler": "wsd",
19
+ "layer_idx": 3,
20
+ "l1_exp": -6,
21
+ "neuron_reinit_percent": 0.85,
22
+ "beta1": 1,
23
+ "beta2": 4,
24
+ "reinit_target": "error",
25
+ "sparse_adam": false,
26
+ "run_template": "Ra{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
27
+ "project_name": "res_pre_attn",
28
+ "decoder_bias": true,
29
+ "l1_beta": 0.99,
30
+ "alt_sparsity_loss": "log",
31
+ "l1_ratio": 1,
32
+ "l1_p": 0,
33
+ "optimizer": "sparse_adam",
34
+ "model_type": "res_A",
35
+ "adam_beta1": 0.5,
36
+ "adam_beta2": 0.9375,
37
+ "run_name": "Ra3_S-6_R1_P0"
38
+ }