diff --git a/attn/A0_S-1_R1_P0.pt b/attn/A0_S-1_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4b80d7fbe31971ead5cb53662693db3a825871de
--- /dev/null
+++ b/attn/A0_S-1_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:881b7169ab0e6f4fcb87372653876c278f975d4efeeb43346a80f3ca6b7c2aaa
+size 153705080
diff --git a/attn/A0_S-1_R1_P0_config.json b/attn/A0_S-1_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..83f3b160bcdac213ae26e68e73c5d28dde7aa7f4
--- /dev/null
+++ b/attn/A0_S-1_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -1,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-1_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-2_R1_P0.pt b/attn/A0_S-2_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6b3356144518227ef204daf27581540d791409b0
--- /dev/null
+++ b/attn/A0_S-2_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a967d32a0cf8724fa9fe4d2a6aac4925037c75c5307891546056f5fd04dc9fd
+size 153705080
diff --git a/attn/A0_S-2_R1_P0_config.json b/attn/A0_S-2_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..251b0be051607f57ed702ce77c11d3f296e62a2f
--- /dev/null
+++ b/attn/A0_S-2_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-2_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-3_R1_P0.pt b/attn/A0_S-3_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3f8b162d867accd75a6c3c0046a1d181fc15c349
--- /dev/null
+++ b/attn/A0_S-3_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1b7ccabe7a4a3e5fa5811c6013c57228362bf80734afcbecf1b567569e5f542
+size 153705080
diff --git a/attn/A0_S-3_R1_P0_config.json b/attn/A0_S-3_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7f841e7ce0528216a13adf32b283cd71eceb8c4
--- /dev/null
+++ b/attn/A0_S-3_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-3_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-4_R1_P0.pt b/attn/A0_S-4_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e594a49cdd5bff5960ed1f3dee59145605a596a3
--- /dev/null
+++ b/attn/A0_S-4_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df8d6af259884d54c4fe117c8b4bff09a04e049187cd6a29f16a1445a09d656c
+size 153705080
diff --git a/attn/A0_S-4_R1_P0_config.json b/attn/A0_S-4_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd64cd263d98157efd3a61d5aa4b7ee130412ba2
--- /dev/null
+++ b/attn/A0_S-4_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-4_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-5_R1_P0.pt b/attn/A0_S-5_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5adb92fee38b21ac63d52e0f005ec58dcb6c094a
--- /dev/null
+++ b/attn/A0_S-5_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3225b3849eaf2b0526808aa02f477084cd39751145125b50345778b11e78d21d
+size 153705080
diff --git a/attn/A0_S-5_R1_P0_config.json b/attn/A0_S-5_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d9ab1819865b0b6b2caf16fd1172af124a231f2
--- /dev/null
+++ b/attn/A0_S-5_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-5_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-6_R1_P0.pt b/attn/A0_S-6_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..32ad577a23c829ccd49fbdebeb0cb3e1392b9da3
--- /dev/null
+++ b/attn/A0_S-6_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1aa1970e842e927e4f1c02dee66572818ffea6bc4a40d7d97e62e57d4d95356e
+size 153705080
diff --git a/attn/A0_S-6_R1_P0_config.json b/attn/A0_S-6_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c36f4991a887cfd4b4f3ff3a748b286aa14860e8
--- /dev/null
+++ b/attn/A0_S-6_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-6_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-7_R1_P0.pt b/attn/A0_S-7_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d2cb19d6eb13ad0aa5d31c3c2e0e54484ea3f52c
--- /dev/null
+++ b/attn/A0_S-7_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e63474090a94d6ca4dc76f1198a407692960dc5199abad46ee8409307390d924
+size 153705080
diff --git a/attn/A0_S-7_R1_P0_config.json b/attn/A0_S-7_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d4fc24d3ed67d4288d128ebcb61de6d2e61dcfa
--- /dev/null
+++ b/attn/A0_S-7_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-7_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A0_S-8_R1_P0.pt b/attn/A0_S-8_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d325dc52bcfc3c45bedcf72c7f65873eaea44a0c
--- /dev/null
+++ b/attn/A0_S-8_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7a0ae2e87296395587c99f6d6dda887ce2a22211e868959cd4184c7b32dde7
+size 153705080
diff --git a/attn/A0_S-8_R1_P0_config.json b/attn/A0_S-8_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..93b708ad5a1d1f3ef193efd72e31f9c205993edd
--- /dev/null
+++ b/attn/A0_S-8_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-8_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-1_R1_P0.pt b/attn/A1_S-1_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..130a86cf47429da52155680c50579057c9b0c38e
--- /dev/null
+++ b/attn/A1_S-1_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e9e47e6e3eae481fa23db381f57f1340fb530456ec5d4e625b7232340f7a2f8
+size 153705080
diff --git a/attn/A1_S-1_R1_P0_config.json b/attn/A1_S-1_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f711f48800e891031418fddc5854b178dae4f42e
--- /dev/null
+++ b/attn/A1_S-1_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -1,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-1_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-2_R1_P0.pt b/attn/A1_S-2_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..682f7941b40d10fd6dc07ea3ed73adb84764ffc6
--- /dev/null
+++ b/attn/A1_S-2_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9da8e0ed7db2cfd12e35d74f726b1284295347b9695a965f7ddcaffb0449399e
+size 153705080
diff --git a/attn/A1_S-2_R1_P0_config.json b/attn/A1_S-2_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e4c9f80cc26c549d3c1a273b0c585fb1ef530de
--- /dev/null
+++ b/attn/A1_S-2_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-2_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-3_R1_P0.pt b/attn/A1_S-3_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..09910e93f0c14e1ad1ec67eb926fbd730b007fc4
--- /dev/null
+++ b/attn/A1_S-3_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaa30e7dcfdc66166c34f3b63d3c02048c5f7bbcd2c35e0ed6ab7a1c0213d3b5
+size 153705080
diff --git a/attn/A1_S-3_R1_P0_config.json b/attn/A1_S-3_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..41d457c9e63d2f883e9b885130315a2c454e9619
--- /dev/null
+++ b/attn/A1_S-3_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-3_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-4_R1_P0.pt b/attn/A1_S-4_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d9272a763e07687c479d9c048b444bd71fa8984c
--- /dev/null
+++ b/attn/A1_S-4_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c39898304960fef85e4e335e2ef19aa320b9b52addf607631748cafe3af72f68
+size 153705080
diff --git a/attn/A1_S-4_R1_P0_config.json b/attn/A1_S-4_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..26302217aa525b21af6bd47e5af7f30ac6999a3c
--- /dev/null
+++ b/attn/A1_S-4_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-4_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-5_R1_P0.pt b/attn/A1_S-5_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dbab719263b188d387683a1e7084fa71f476f5ca
--- /dev/null
+++ b/attn/A1_S-5_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f83a42d62c42719e64afb5874fdc7a816aeb66051a9ac0700ef4bc0152de88b3
+size 153705080
diff --git a/attn/A1_S-5_R1_P0_config.json b/attn/A1_S-5_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb8df884f8fd45c081e8a5a0f09a845a1f340cc6
--- /dev/null
+++ b/attn/A1_S-5_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-5_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-6_R1_P0.pt b/attn/A1_S-6_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..04c9331d4db91439c10d1ab343b3660d97ce4ba5
--- /dev/null
+++ b/attn/A1_S-6_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a46305261cd3ecf76a415d4b237a6b9e78fe6f98299d32dc1b72c83951e1883f
+size 153705080
diff --git a/attn/A1_S-6_R1_P0_config.json b/attn/A1_S-6_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6747294fb3bf6c50aeeb4bbfb8b534b785bbb959
--- /dev/null
+++ b/attn/A1_S-6_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-6_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-7_R1_P0.pt b/attn/A1_S-7_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5ea07a96a7cb7f6bee102023b92efb0d069dd59a
--- /dev/null
+++ b/attn/A1_S-7_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9dc01776116cda0db85b31f842153d595aba0d89557090e5ee61d10e83098bfe
+size 153705080
diff --git a/attn/A1_S-7_R1_P0_config.json b/attn/A1_S-7_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9f4f5b760ffaa1be28c7d87e0242ab3c92888b90
--- /dev/null
+++ b/attn/A1_S-7_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-7_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A1_S-8_R1_P0.pt b/attn/A1_S-8_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..80aa67a76e34a1b9bb6b5438a2df30fe247dc828
--- /dev/null
+++ b/attn/A1_S-8_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f67148f42731c6f0417868ebbb4840f3ca38214c53a43579b7e74c2bd7498e
+size 153705080
diff --git a/attn/A1_S-8_R1_P0_config.json b/attn/A1_S-8_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6eae0777749190f7a9b9fe5e9ffe8d554d54a936
--- /dev/null
+++ b/attn/A1_S-8_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-8_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-1_R1_P0.pt b/attn/A2_S-1_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..252c2e8d11876615a583ab60639a5a66d683c8ef
--- /dev/null
+++ b/attn/A2_S-1_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e77aee122249497ecd59ae4c3c6d3adf19b84c7c2d1457a70eb46f72457147d
+size 153705080
diff --git a/attn/A2_S-1_R1_P0_config.json b/attn/A2_S-1_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..017b142cf124b2996117423bf500022ed1ca80f8
--- /dev/null
+++ b/attn/A2_S-1_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -1,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-1_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-2_R1_P0.pt b/attn/A2_S-2_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4ea161dd005953f2ec8f496a29263ffaafe44798
--- /dev/null
+++ b/attn/A2_S-2_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82a94ba450fb8ab21ac7951ba04a152282ccc9048e082a518f865768cf735eaf
+size 153705080
diff --git a/attn/A2_S-2_R1_P0_config.json b/attn/A2_S-2_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c850d412598b18390a1c00fa1aa169ae7474e2e3
--- /dev/null
+++ b/attn/A2_S-2_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-2_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-3_R1_P0.pt b/attn/A2_S-3_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d2c4bdd2d6304c6054dd8483857394250938a1c3
--- /dev/null
+++ b/attn/A2_S-3_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de6f0b4dad578eab23f3a7e0373c4e2aae4cf2f23a161cf05672185687d0634d
+size 153705080
diff --git a/attn/A2_S-3_R1_P0_config.json b/attn/A2_S-3_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea3b2cad829a1f2de0b370d96ef328dc6d460f2f
--- /dev/null
+++ b/attn/A2_S-3_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-3_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-4_R1_P0.pt b/attn/A2_S-4_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d32a090310b7f0d6a75dfc91e183dbc1c7ff2263
--- /dev/null
+++ b/attn/A2_S-4_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c89d9e68cc30e96eb3466874e9c058dea7ea6e04df51a2f8be962cc8e883b022
+size 153705080
diff --git a/attn/A2_S-4_R1_P0_config.json b/attn/A2_S-4_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7e775a4286d04f16c0057fea4947b66813635258
--- /dev/null
+++ b/attn/A2_S-4_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-4_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-5_R1_P0.pt b/attn/A2_S-5_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a10ab49600a8c0e58f4ac1fa5fc95c825317499
--- /dev/null
+++ b/attn/A2_S-5_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce3bfc12d00c24c5f11bca089b3b567e8caeba847bb17b0c63122a1e4e54838b
+size 153705080
diff --git a/attn/A2_S-5_R1_P0_config.json b/attn/A2_S-5_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4aff8cb460b9fae73e23000cafc637b21122812e
--- /dev/null
+++ b/attn/A2_S-5_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-5_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-6_R1_P0.pt b/attn/A2_S-6_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..813cf45672fe444e59101f9391c36db7e4d0c61b
--- /dev/null
+++ b/attn/A2_S-6_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:946fb7a78c0a1bbdffdeebac539ce09e00dcfe2510f6a119a2e87ea1ae291df1
+size 153705080
diff --git a/attn/A2_S-6_R1_P0_config.json b/attn/A2_S-6_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3a99a05630cea8b6985f938b000a06693951e20
--- /dev/null
+++ b/attn/A2_S-6_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-6_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-7_R1_P0.pt b/attn/A2_S-7_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3ed82605448eb3b4e3bac655ab393b60dd964dfb
--- /dev/null
+++ b/attn/A2_S-7_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21d55f2678f24117c8963bd013ac42b6202b770f9a2423c1f55ba01967f6e670
+size 153705080
diff --git a/attn/A2_S-7_R1_P0_config.json b/attn/A2_S-7_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f14217db42f9cadc1df5457361499ff55f01eea1
--- /dev/null
+++ b/attn/A2_S-7_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-7_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A2_S-8_R1_P0.pt b/attn/A2_S-8_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9d877f7e90d2a494d0766358b321987f7d0e0805
--- /dev/null
+++ b/attn/A2_S-8_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:093c51dda5c92c3b33ca29a6d2944e48ae83ff060ea812cec54ed8196e4a0fb7
+size 153705080
diff --git a/attn/A2_S-8_R1_P0_config.json b/attn/A2_S-8_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..1698f744aba443b18dd479fe542f7e756b631356
--- /dev/null
+++ b/attn/A2_S-8_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-8_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-1_R1_P0.pt b/attn/A3_S-1_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..88687dceb49402a33048b3989b564e3eb5139517
--- /dev/null
+++ b/attn/A3_S-1_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac13f6074c397c536dfed42c409d186b7ec71599cc137a0034a558e19342fd34
+size 153705080
diff --git a/attn/A3_S-1_R1_P0_config.json b/attn/A3_S-1_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..180cf9112a50be6d752191bc3ad66907a52f7eee
--- /dev/null
+++ b/attn/A3_S-1_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -1,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-1_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-2_R1_P0.pt b/attn/A3_S-2_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e49d2650f1134cbeb1d7ab37cafe780597c09021
--- /dev/null
+++ b/attn/A3_S-2_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93e8279dc7296f421a340036b17b96afbdccd4b3a31a60845b572962f39acaf3
+size 153705080
diff --git a/attn/A3_S-2_R1_P0_config.json b/attn/A3_S-2_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9efca91f660028191a3dd63f0c052b05c3f64f2
--- /dev/null
+++ b/attn/A3_S-2_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-2_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-3_R1_P0.pt b/attn/A3_S-3_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c6933a0a35fe2304443c31103a01d71bab69f000
--- /dev/null
+++ b/attn/A3_S-3_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9924363fe283d547f84cfa283ef0e0a56aa64ae2291265951f540685cd3dc20c
+size 153705080
diff --git a/attn/A3_S-3_R1_P0_config.json b/attn/A3_S-3_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..28fa78db26f3f27c3a6b0fa58b0a310ce9f9bb96
--- /dev/null
+++ b/attn/A3_S-3_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-3_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-4_R1_P0.pt b/attn/A3_S-4_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c3b46017626ce5c061480fd76be68c44a6746099
--- /dev/null
+++ b/attn/A3_S-4_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:774dd034c7cc62ddfc779bb3ba8f7cb4a8c0791f32b6afde75027e6477402eed
+size 153705080
diff --git a/attn/A3_S-4_R1_P0_config.json b/attn/A3_S-4_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fda0a59e2dcbe441b413b04ac217b96db4991e77
--- /dev/null
+++ b/attn/A3_S-4_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-4_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-5_R1_P0.pt b/attn/A3_S-5_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f90ff61c88f07c74c4cbb38a274017dd22599c54
--- /dev/null
+++ b/attn/A3_S-5_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b6bc4b24e527dd772f059545765a0d3041b29d6c57ebb99d15d0a6a49ca1be0
+size 153705080
diff --git a/attn/A3_S-5_R1_P0_config.json b/attn/A3_S-5_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f03ed2ef626b9e30b076d0960fdefac544b50b01
--- /dev/null
+++ b/attn/A3_S-5_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-5_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-6_R1_P0.pt b/attn/A3_S-6_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d0fd7a7e761724d9cbbf4291fadcd62733e9bc51
--- /dev/null
+++ b/attn/A3_S-6_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4682de083ca79eebe8ce713f0008eeed0a3f7e9e892ca1486077213c3bb0085
+size 153705080
diff --git a/attn/A3_S-6_R1_P0_config.json b/attn/A3_S-6_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..571969ec99b36f5ed010b9eb9d318141eb11b8f1
--- /dev/null
+++ b/attn/A3_S-6_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-6_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-7_R1_P0.pt b/attn/A3_S-7_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4083bcdf9b5fdd38ac98b21cd114fd3e11c46190
--- /dev/null
+++ b/attn/A3_S-7_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4afe2079717756606813c533cd7fa0c6270172540ff6b88a61e5c288aac1ecd8
+size 153705080
diff --git a/attn/A3_S-7_R1_P0_config.json b/attn/A3_S-7_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3938a5e2d966ab723d7b7e32875f37ed5a8c707
--- /dev/null
+++ b/attn/A3_S-7_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-7_R1_P0"
+}
\ No newline at end of file
diff --git a/attn/A3_S-8_R1_P0.pt b/attn/A3_S-8_R1_P0.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a384031cb3fb4a7c90a3da44ba262bb8b1e10e5e
--- /dev/null
+++ b/attn/A3_S-8_R1_P0.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81bfc5a69399687b0f7db6544ebb93412528282934c416d81d90d7a245de7041
+size 153705080
diff --git a/attn/A3_S-8_R1_P0_config.json b/attn/A3_S-8_R1_P0_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..eaf3b58ef2b6b1556778a0568ad4dc6bcf2f4a04
--- /dev/null
+++ b/attn/A3_S-8_R1_P0_config.json
@@ -0,0 +1,38 @@
+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-8_R1_P0"
+}
\ No newline at end of file