diff --git a/attn/A0_S-1_R1_P0.pt b/attn/A0_S-1_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b80d7fbe31971ead5cb53662693db3a825871de --- /dev/null +++ b/attn/A0_S-1_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:881b7169ab0e6f4fcb87372653876c278f975d4efeeb43346a80f3ca6b7c2aaa +size 153705080 diff --git a/attn/A0_S-1_R1_P0_config.json b/attn/A0_S-1_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83f3b160bcdac213ae26e68e73c5d28dde7aa7f4 --- /dev/null +++ b/attn/A0_S-1_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-1_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-2_R1_P0.pt b/attn/A0_S-2_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b3356144518227ef204daf27581540d791409b0 --- /dev/null +++ b/attn/A0_S-2_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a967d32a0cf8724fa9fe4d2a6aac4925037c75c5307891546056f5fd04dc9fd +size 153705080 diff --git a/attn/A0_S-2_R1_P0_config.json b/attn/A0_S-2_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..251b0be051607f57ed702ce77c11d3f296e62a2f --- /dev/null +++ b/attn/A0_S-2_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-2_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-3_R1_P0.pt b/attn/A0_S-3_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f8b162d867accd75a6c3c0046a1d181fc15c349 --- /dev/null +++ b/attn/A0_S-3_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1b7ccabe7a4a3e5fa5811c6013c57228362bf80734afcbecf1b567569e5f542 +size 153705080 diff --git a/attn/A0_S-3_R1_P0_config.json b/attn/A0_S-3_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f7f841e7ce0528216a13adf32b283cd71eceb8c4 --- /dev/null +++ b/attn/A0_S-3_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-3_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-4_R1_P0.pt b/attn/A0_S-4_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..e594a49cdd5bff5960ed1f3dee59145605a596a3 --- /dev/null +++ b/attn/A0_S-4_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df8d6af259884d54c4fe117c8b4bff09a04e049187cd6a29f16a1445a09d656c +size 153705080 diff --git a/attn/A0_S-4_R1_P0_config.json b/attn/A0_S-4_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd64cd263d98157efd3a61d5aa4b7ee130412ba2 --- /dev/null +++ b/attn/A0_S-4_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-4_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-5_R1_P0.pt b/attn/A0_S-5_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..5adb92fee38b21ac63d52e0f005ec58dcb6c094a --- /dev/null +++ b/attn/A0_S-5_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3225b3849eaf2b0526808aa02f477084cd39751145125b50345778b11e78d21d +size 153705080 diff --git a/attn/A0_S-5_R1_P0_config.json b/attn/A0_S-5_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d9ab1819865b0b6b2caf16fd1172af124a231f2 --- /dev/null +++ b/attn/A0_S-5_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-5_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-6_R1_P0.pt b/attn/A0_S-6_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..32ad577a23c829ccd49fbdebeb0cb3e1392b9da3 --- /dev/null +++ b/attn/A0_S-6_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa1970e842e927e4f1c02dee66572818ffea6bc4a40d7d97e62e57d4d95356e +size 153705080 diff --git a/attn/A0_S-6_R1_P0_config.json b/attn/A0_S-6_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c36f4991a887cfd4b4f3ff3a748b286aa14860e8 --- /dev/null +++ b/attn/A0_S-6_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-6_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-7_R1_P0.pt b/attn/A0_S-7_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2cb19d6eb13ad0aa5d31c3c2e0e54484ea3f52c --- /dev/null +++ b/attn/A0_S-7_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e63474090a94d6ca4dc76f1198a407692960dc5199abad46ee8409307390d924 +size 153705080 diff --git a/attn/A0_S-7_R1_P0_config.json b/attn/A0_S-7_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8d4fc24d3ed67d4288d128ebcb61de6d2e61dcfa --- /dev/null +++ b/attn/A0_S-7_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-7_R1_P0" +} \ No newline at end of file diff --git a/attn/A0_S-8_R1_P0.pt b/attn/A0_S-8_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d325dc52bcfc3c45bedcf72c7f65873eaea44a0c --- /dev/null +++ b/attn/A0_S-8_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b7a0ae2e87296395587c99f6d6dda887ce2a22211e868959cd4184c7b32dde7 +size 153705080 diff --git a/attn/A0_S-8_R1_P0_config.json b/attn/A0_S-8_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..93b708ad5a1d1f3ef193efd72e31f9c205993edd --- /dev/null +++ b/attn/A0_S-8_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_S-8_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-1_R1_P0.pt b/attn/A1_S-1_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..130a86cf47429da52155680c50579057c9b0c38e --- /dev/null +++ b/attn/A1_S-1_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9e47e6e3eae481fa23db381f57f1340fb530456ec5d4e625b7232340f7a2f8 +size 153705080 diff --git a/attn/A1_S-1_R1_P0_config.json b/attn/A1_S-1_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f711f48800e891031418fddc5854b178dae4f42e --- /dev/null +++ b/attn/A1_S-1_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-1_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-2_R1_P0.pt b/attn/A1_S-2_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..682f7941b40d10fd6dc07ea3ed73adb84764ffc6 --- /dev/null +++ b/attn/A1_S-2_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da8e0ed7db2cfd12e35d74f726b1284295347b9695a965f7ddcaffb0449399e +size 153705080 diff --git a/attn/A1_S-2_R1_P0_config.json b/attn/A1_S-2_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e4c9f80cc26c549d3c1a273b0c585fb1ef530de --- /dev/null +++ b/attn/A1_S-2_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-2_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-3_R1_P0.pt b/attn/A1_S-3_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..09910e93f0c14e1ad1ec67eb926fbd730b007fc4 --- /dev/null +++ b/attn/A1_S-3_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaa30e7dcfdc66166c34f3b63d3c02048c5f7bbcd2c35e0ed6ab7a1c0213d3b5 +size 153705080 diff --git a/attn/A1_S-3_R1_P0_config.json b/attn/A1_S-3_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..41d457c9e63d2f883e9b885130315a2c454e9619 --- /dev/null +++ b/attn/A1_S-3_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-3_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-4_R1_P0.pt b/attn/A1_S-4_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9272a763e07687c479d9c048b444bd71fa8984c --- /dev/null +++ b/attn/A1_S-4_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39898304960fef85e4e335e2ef19aa320b9b52addf607631748cafe3af72f68 +size 153705080 diff --git a/attn/A1_S-4_R1_P0_config.json b/attn/A1_S-4_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26302217aa525b21af6bd47e5af7f30ac6999a3c --- /dev/null +++ b/attn/A1_S-4_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-4_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-5_R1_P0.pt b/attn/A1_S-5_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbab719263b188d387683a1e7084fa71f476f5ca --- /dev/null +++ b/attn/A1_S-5_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f83a42d62c42719e64afb5874fdc7a816aeb66051a9ac0700ef4bc0152de88b3 +size 153705080 diff --git a/attn/A1_S-5_R1_P0_config.json b/attn/A1_S-5_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eb8df884f8fd45c081e8a5a0f09a845a1f340cc6 --- /dev/null +++ b/attn/A1_S-5_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-5_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-6_R1_P0.pt b/attn/A1_S-6_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..04c9331d4db91439c10d1ab343b3660d97ce4ba5 --- /dev/null +++ b/attn/A1_S-6_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46305261cd3ecf76a415d4b237a6b9e78fe6f98299d32dc1b72c83951e1883f +size 153705080 diff --git a/attn/A1_S-6_R1_P0_config.json b/attn/A1_S-6_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6747294fb3bf6c50aeeb4bbfb8b534b785bbb959 --- /dev/null +++ b/attn/A1_S-6_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-6_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-7_R1_P0.pt b/attn/A1_S-7_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ea07a96a7cb7f6bee102023b92efb0d069dd59a --- /dev/null +++ b/attn/A1_S-7_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc01776116cda0db85b31f842153d595aba0d89557090e5ee61d10e83098bfe +size 153705080 diff --git a/attn/A1_S-7_R1_P0_config.json b/attn/A1_S-7_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9f4f5b760ffaa1be28c7d87e0242ab3c92888b90 --- /dev/null +++ b/attn/A1_S-7_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-7_R1_P0" +} \ No newline at end of file diff --git a/attn/A1_S-8_R1_P0.pt b/attn/A1_S-8_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..80aa67a76e34a1b9bb6b5438a2df30fe247dc828 --- /dev/null +++ b/attn/A1_S-8_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f67148f42731c6f0417868ebbb4840f3ca38214c53a43579b7e74c2bd7498e +size 153705080 diff --git a/attn/A1_S-8_R1_P0_config.json b/attn/A1_S-8_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6eae0777749190f7a9b9fe5e9ffe8d554d54a936 --- /dev/null +++ b/attn/A1_S-8_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_S-8_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-1_R1_P0.pt b/attn/A2_S-1_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..252c2e8d11876615a583ab60639a5a66d683c8ef --- /dev/null +++ b/attn/A2_S-1_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e77aee122249497ecd59ae4c3c6d3adf19b84c7c2d1457a70eb46f72457147d +size 153705080 diff --git a/attn/A2_S-1_R1_P0_config.json b/attn/A2_S-1_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..017b142cf124b2996117423bf500022ed1ca80f8 --- /dev/null +++ b/attn/A2_S-1_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-1_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-2_R1_P0.pt b/attn/A2_S-2_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ea161dd005953f2ec8f496a29263ffaafe44798 --- /dev/null +++ b/attn/A2_S-2_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a94ba450fb8ab21ac7951ba04a152282ccc9048e082a518f865768cf735eaf +size 153705080 diff --git a/attn/A2_S-2_R1_P0_config.json b/attn/A2_S-2_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c850d412598b18390a1c00fa1aa169ae7474e2e3 --- /dev/null +++ b/attn/A2_S-2_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-2_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-3_R1_P0.pt b/attn/A2_S-3_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2c4bdd2d6304c6054dd8483857394250938a1c3 --- /dev/null +++ b/attn/A2_S-3_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6f0b4dad578eab23f3a7e0373c4e2aae4cf2f23a161cf05672185687d0634d +size 153705080 diff --git a/attn/A2_S-3_R1_P0_config.json b/attn/A2_S-3_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea3b2cad829a1f2de0b370d96ef328dc6d460f2f --- /dev/null +++ b/attn/A2_S-3_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-3_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-4_R1_P0.pt b/attn/A2_S-4_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d32a090310b7f0d6a75dfc91e183dbc1c7ff2263 --- /dev/null +++ b/attn/A2_S-4_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89d9e68cc30e96eb3466874e9c058dea7ea6e04df51a2f8be962cc8e883b022 +size 153705080 diff --git a/attn/A2_S-4_R1_P0_config.json b/attn/A2_S-4_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e775a4286d04f16c0057fea4947b66813635258 --- /dev/null +++ b/attn/A2_S-4_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-4_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-5_R1_P0.pt b/attn/A2_S-5_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a10ab49600a8c0e58f4ac1fa5fc95c825317499 --- /dev/null +++ b/attn/A2_S-5_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce3bfc12d00c24c5f11bca089b3b567e8caeba847bb17b0c63122a1e4e54838b +size 153705080 diff --git a/attn/A2_S-5_R1_P0_config.json b/attn/A2_S-5_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4aff8cb460b9fae73e23000cafc637b21122812e --- /dev/null +++ b/attn/A2_S-5_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-5_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-6_R1_P0.pt b/attn/A2_S-6_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..813cf45672fe444e59101f9391c36db7e4d0c61b --- /dev/null +++ b/attn/A2_S-6_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:946fb7a78c0a1bbdffdeebac539ce09e00dcfe2510f6a119a2e87ea1ae291df1 +size 153705080 diff --git a/attn/A2_S-6_R1_P0_config.json b/attn/A2_S-6_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f3a99a05630cea8b6985f938b000a06693951e20 --- /dev/null +++ b/attn/A2_S-6_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-6_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-7_R1_P0.pt b/attn/A2_S-7_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ed82605448eb3b4e3bac655ab393b60dd964dfb --- /dev/null +++ b/attn/A2_S-7_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d55f2678f24117c8963bd013ac42b6202b770f9a2423c1f55ba01967f6e670 +size 153705080 diff --git a/attn/A2_S-7_R1_P0_config.json b/attn/A2_S-7_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f14217db42f9cadc1df5457361499ff55f01eea1 --- /dev/null +++ b/attn/A2_S-7_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-7_R1_P0" +} \ No newline at end of file diff --git a/attn/A2_S-8_R1_P0.pt b/attn/A2_S-8_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d877f7e90d2a494d0766358b321987f7d0e0805 --- /dev/null +++ b/attn/A2_S-8_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:093c51dda5c92c3b33ca29a6d2944e48ae83ff060ea812cec54ed8196e4a0fb7 +size 153705080 diff --git a/attn/A2_S-8_R1_P0_config.json b/attn/A2_S-8_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1698f744aba443b18dd479fe542f7e756b631356 --- /dev/null +++ b/attn/A2_S-8_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 2, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A2_S-8_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-1_R1_P0.pt b/attn/A3_S-1_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..88687dceb49402a33048b3989b564e3eb5139517 --- /dev/null +++ b/attn/A3_S-1_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac13f6074c397c536dfed42c409d186b7ec71599cc137a0034a558e19342fd34 +size 153705080 diff --git a/attn/A3_S-1_R1_P0_config.json b/attn/A3_S-1_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..180cf9112a50be6d752191bc3ad66907a52f7eee --- /dev/null +++ b/attn/A3_S-1_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-1_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-2_R1_P0.pt b/attn/A3_S-2_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..e49d2650f1134cbeb1d7ab37cafe780597c09021 --- /dev/null +++ b/attn/A3_S-2_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e8279dc7296f421a340036b17b96afbdccd4b3a31a60845b572962f39acaf3 +size 153705080 diff --git a/attn/A3_S-2_R1_P0_config.json b/attn/A3_S-2_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b9efca91f660028191a3dd63f0c052b05c3f64f2 --- /dev/null +++ b/attn/A3_S-2_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-2_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-3_R1_P0.pt b/attn/A3_S-3_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6933a0a35fe2304443c31103a01d71bab69f000 --- /dev/null +++ b/attn/A3_S-3_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9924363fe283d547f84cfa283ef0e0a56aa64ae2291265951f540685cd3dc20c +size 153705080 diff --git a/attn/A3_S-3_R1_P0_config.json b/attn/A3_S-3_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28fa78db26f3f27c3a6b0fa58b0a310ce9f9bb96 --- /dev/null +++ b/attn/A3_S-3_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-3_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-4_R1_P0.pt b/attn/A3_S-4_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3b46017626ce5c061480fd76be68c44a6746099 --- /dev/null +++ b/attn/A3_S-4_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774dd034c7cc62ddfc779bb3ba8f7cb4a8c0791f32b6afde75027e6477402eed +size 153705080 diff --git a/attn/A3_S-4_R1_P0_config.json b/attn/A3_S-4_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fda0a59e2dcbe441b413b04ac217b96db4991e77 --- /dev/null +++ b/attn/A3_S-4_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-4_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-5_R1_P0.pt b/attn/A3_S-5_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..f90ff61c88f07c74c4cbb38a274017dd22599c54 --- /dev/null +++ b/attn/A3_S-5_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6bc4b24e527dd772f059545765a0d3041b29d6c57ebb99d15d0a6a49ca1be0 +size 153705080 diff --git a/attn/A3_S-5_R1_P0_config.json b/attn/A3_S-5_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f03ed2ef626b9e30b076d0960fdefac544b50b01 --- /dev/null +++ b/attn/A3_S-5_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-5_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-6_R1_P0.pt b/attn/A3_S-6_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0fd7a7e761724d9cbbf4291fadcd62733e9bc51 --- /dev/null +++ b/attn/A3_S-6_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4682de083ca79eebe8ce713f0008eeed0a3f7e9e892ca1486077213c3bb0085 +size 153705080 diff --git a/attn/A3_S-6_R1_P0_config.json b/attn/A3_S-6_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..571969ec99b36f5ed010b9eb9d318141eb11b8f1 --- /dev/null +++ b/attn/A3_S-6_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-6_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-7_R1_P0.pt b/attn/A3_S-7_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..4083bcdf9b5fdd38ac98b21cd114fd3e11c46190 --- /dev/null +++ b/attn/A3_S-7_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afe2079717756606813c533cd7fa0c6270172540ff6b88a61e5c288aac1ecd8 +size 153705080 diff --git a/attn/A3_S-7_R1_P0_config.json b/attn/A3_S-7_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e3938a5e2d966ab723d7b7e32875f37ed5a8c707 --- /dev/null +++ b/attn/A3_S-7_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-7_R1_P0" +} \ No newline at end of file diff --git a/attn/A3_S-8_R1_P0.pt b/attn/A3_S-8_R1_P0.pt new file mode 100644 index 0000000000000000000000000000000000000000..a384031cb3fb4a7c90a3da44ba262bb8b1e10e5e --- /dev/null +++ b/attn/A3_S-8_R1_P0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81bfc5a69399687b0f7db6544ebb93412528282934c416d81d90d7a245de7041 +size 153705080 diff --git a/attn/A3_S-8_R1_P0_config.json b/attn/A3_S-8_R1_P0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eaf3b58ef2b6b1556778a0568ad4dc6bcf2f4a04 --- /dev/null +++ b/attn/A3_S-8_R1_P0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 25000, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 3, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}", + "project_name": "attn_test", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A3_S-8_R1_P0" +} \ No newline at end of file