diff --git a/small_attn_out/A0_N100_S-1.pt b/small_attn_out/A0_N100_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa2352f268d7db19c2f07ca03caac359255bfedc --- /dev/null +++ b/small_attn_out/A0_N100_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb86b244378b3011dfc004d1058b6115d991aa9955b699fd455d4b8bd560c6f5 +size 619888 diff --git a/small_attn_out/A0_N100_S-10.pt b/small_attn_out/A0_N100_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..793cb75162048b01c7d2cc2f1d5ded1001856be0 --- /dev/null +++ b/small_attn_out/A0_N100_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9abeff8e1ac701f13b70f199d73896437df864eb5641b6b61001cf97ece40c8c +size 619896 diff --git a/small_attn_out/A0_N100_S-10_config.json b/small_attn_out/A0_N100_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb8f7d65376cc6ab41bfaffe27e1d91894f5c77 --- /dev/null +++ b/small_attn_out/A0_N100_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-1_config.json b/small_attn_out/A0_N100_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fd6938c1dda41b12d94cad2237453815e511fcf3 --- /dev/null +++ b/small_attn_out/A0_N100_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-2.pt b/small_attn_out/A0_N100_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..5de7965589cadd70bd7954ed47c88d42090a8157 --- /dev/null +++ b/small_attn_out/A0_N100_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f00dfe46558ea82dc20bb8d8af5adb13e33404143df202ace91637481a2e02 +size 619888 diff --git a/small_attn_out/A0_N100_S-2_config.json b/small_attn_out/A0_N100_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..185f25ebe3fdada1994b750686995746248f4800 --- /dev/null +++ b/small_attn_out/A0_N100_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-3.pt b/small_attn_out/A0_N100_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..537d04326b319f0e5c7791dba8e8cc7d1e4c2426 --- /dev/null +++ b/small_attn_out/A0_N100_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdfb7e2f79e79ff1a56ae5d3b9ef0925b793cc8bb73c94c5cb1198c109160a87 +size 619888 diff --git a/small_attn_out/A0_N100_S-3_config.json b/small_attn_out/A0_N100_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c5528834e7c1f7e4dea10199bf60851c782e2539 --- /dev/null +++ b/small_attn_out/A0_N100_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-4.pt b/small_attn_out/A0_N100_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f8fdb07556b5b55d321c89bc18d7b6055d116ba --- /dev/null +++ b/small_attn_out/A0_N100_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64172016801e954ba2bff8539a99c6022cfdfb9fe3cd208563a0f3c379fdd47c +size 619888 diff --git a/small_attn_out/A0_N100_S-4_config.json b/small_attn_out/A0_N100_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..41b6973eb804f5820222c2c71d683e4cfa213a20 --- /dev/null +++ b/small_attn_out/A0_N100_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-5.pt b/small_attn_out/A0_N100_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..84cc923e178026e57217190615daac2340b89674 --- /dev/null +++ b/small_attn_out/A0_N100_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572e1822d0aaa37dad7d069960134b523b82ce07066395eb9f210053ced8539b +size 619888 diff --git a/small_attn_out/A0_N100_S-5_config.json b/small_attn_out/A0_N100_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..07e02583d771235c50940c84e16f9355b5043309 --- /dev/null +++ b/small_attn_out/A0_N100_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-6.pt b/small_attn_out/A0_N100_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5ac6c2a2803b500b6757c4d450920a09ba4fc01 --- /dev/null +++ b/small_attn_out/A0_N100_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78f5980b6982986dc61df07870239c00001b0767322b6467d13e826335bcce1 +size 619888 diff --git a/small_attn_out/A0_N100_S-6_config.json b/small_attn_out/A0_N100_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..997a2441117c743065e68784aab2a815c90f9322 --- /dev/null +++ b/small_attn_out/A0_N100_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-7.pt b/small_attn_out/A0_N100_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b23508f2af5beadfd6eec76d8f9a0ee96615502 --- /dev/null +++ b/small_attn_out/A0_N100_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28bb163f29b2ba110d5b35ea4c17d5638148b361d0bafa1e7f337fb8444015fe +size 619888 diff --git a/small_attn_out/A0_N100_S-7_config.json b/small_attn_out/A0_N100_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..978ee421e435e3941c5cd14d6bb283eaf044514f --- /dev/null +++ b/small_attn_out/A0_N100_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-8.pt b/small_attn_out/A0_N100_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..b97a252fd8cccd547cd832eeb8f9097733dc28db --- /dev/null +++ b/small_attn_out/A0_N100_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:984aa7811931448bcf982f409cae513a98d98d7d6c7d90a1088d946aee8627ee +size 619888 diff --git a/small_attn_out/A0_N100_S-8_config.json b/small_attn_out/A0_N100_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f85b25e62a6d779ca88df0efd5194f08f228e4a0 --- /dev/null +++ b/small_attn_out/A0_N100_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S-9.pt b/small_attn_out/A0_N100_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a7bdce24b93f7d7cb77822b345bd8518ff10465 --- /dev/null +++ b/small_attn_out/A0_N100_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a03f1994c850df139158d9725ea08d69d5671fceb4f8d18472865236307c26d +size 619888 diff --git a/small_attn_out/A0_N100_S-9_config.json b/small_attn_out/A0_N100_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..200caf1be441e24789c697f66335db87b9e4b544 --- /dev/null +++ b/small_attn_out/A0_N100_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S0.pt b/small_attn_out/A0_N100_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..07eb54eef8536668456d12296931f15e05fe24de --- /dev/null +++ b/small_attn_out/A0_N100_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538d5363bfe0940cdf3a83928c27ad3d3aad4715d05a2a4832f00ddba40f31d8 +size 619880 diff --git a/small_attn_out/A0_N100_S0_config.json b/small_attn_out/A0_N100_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67feff425231b295373c6c0196d08280633eea9e --- /dev/null +++ b/small_attn_out/A0_N100_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S0" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S1.pt b/small_attn_out/A0_N100_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..91ec170dc642562f5f2b52c6eabb87c2e3019e06 --- /dev/null +++ b/small_attn_out/A0_N100_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eb074a9c3ba59325eb7c86710b1b609ee648ec50ab0f76e447438b5a0782920 +size 619880 diff --git a/small_attn_out/A0_N100_S1_config.json b/small_attn_out/A0_N100_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cae2963846fa57a49ba9b8de664b11bfb42562bf --- /dev/null +++ b/small_attn_out/A0_N100_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N100_S2.pt b/small_attn_out/A0_N100_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..803cbef4903292d68f29d5fc3450576ceee334f7 --- /dev/null +++ b/small_attn_out/A0_N100_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017b709d1416d68bd1d7b3930b35d311d031e08aa6742c29a6c136ffef8c609b +size 619880 diff --git a/small_attn_out/A0_N100_S2_config.json b/small_attn_out/A0_N100_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2d0852c13f5651fc92ab7d2758606203f01efd --- /dev/null +++ b/small_attn_out/A0_N100_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N100_S2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-1.pt b/small_attn_out/A0_N300_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b031cc734cf5f354faf2bcd1406cddcd5d7c0ad --- /dev/null +++ b/small_attn_out/A0_N300_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c7ffcaa369e5d113f5f5549675f412659f653d2c576a3fe6fc04b08b2199f32 +size 1849456 diff --git a/small_attn_out/A0_N300_S-10.pt b/small_attn_out/A0_N300_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..c00b6256b4e6290edb6a72d94a5a7311216791d4 --- /dev/null +++ b/small_attn_out/A0_N300_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81488224431b9b80530e275eceb1358a9bb09c1e7912663c8d6efab343e48424 +size 1849464 diff --git a/small_attn_out/A0_N300_S-10_config.json b/small_attn_out/A0_N300_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0cd6152f03e7a5325fdf6b244399b5e8da260ba4 --- /dev/null +++ b/small_attn_out/A0_N300_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-1_config.json b/small_attn_out/A0_N300_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..402997e4912dab7461163bcd1ad283c0891a9fd1 --- /dev/null +++ b/small_attn_out/A0_N300_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-2.pt b/small_attn_out/A0_N300_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8e1fdb0fa0f7b13f5b11eb22a97f3500bedfe39 --- /dev/null +++ b/small_attn_out/A0_N300_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d3a95eef0b490b3697ec4dd635c38ecb34f84f2c5318cbae0388b5a57f9596 +size 1849456 diff --git a/small_attn_out/A0_N300_S-2_config.json b/small_attn_out/A0_N300_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f4fc888e4fff4e905687027fc93bb4b03a2645ab --- /dev/null +++ b/small_attn_out/A0_N300_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-3.pt b/small_attn_out/A0_N300_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..a475c98293851450d038c40a9ffa7e9373fd7651 --- /dev/null +++ b/small_attn_out/A0_N300_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8da48169455324c25813b76bc11ab00c1adf63ec56f1851229f1897aad23c567 +size 1849456 diff --git a/small_attn_out/A0_N300_S-3_config.json b/small_attn_out/A0_N300_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..88665f02292b081e180408a424229ae3feabf34b --- /dev/null +++ b/small_attn_out/A0_N300_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-4.pt b/small_attn_out/A0_N300_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..e06f25ad98be6faf444f3e17e0eaf63e22900710 --- /dev/null +++ b/small_attn_out/A0_N300_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4cf9f5d063a769b24dcc08094808aebd8aaa11835e1f61206a34942699d0096 +size 1849456 diff --git a/small_attn_out/A0_N300_S-4_config.json b/small_attn_out/A0_N300_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec196c1f975fb2b3b1fff8c2c2e0a618cb738e2c --- /dev/null +++ b/small_attn_out/A0_N300_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-5.pt b/small_attn_out/A0_N300_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..451a1082d71a881c8a8d9c8b8820332ce18509df --- /dev/null +++ b/small_attn_out/A0_N300_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd976f75c285726ce4dd81340dc4ac6c2f788010dd7150ddeb3c3295d1351a8 +size 1849456 diff --git a/small_attn_out/A0_N300_S-5_config.json b/small_attn_out/A0_N300_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1070706fb515fe0a6c9444886739eb6587bb5f76 --- /dev/null +++ b/small_attn_out/A0_N300_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-6.pt b/small_attn_out/A0_N300_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6da3056b1eeb22c34cf58f49d735e28eb4f9b33 --- /dev/null +++ b/small_attn_out/A0_N300_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df0e2aff051466d445625391a8576d382acceaeac2e53cc3d47b3ece9aa1a72 +size 1849456 diff --git a/small_attn_out/A0_N300_S-6_config.json b/small_attn_out/A0_N300_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5bc5a5b2af9d30ff76820d4cd75ad850406500d8 --- /dev/null +++ b/small_attn_out/A0_N300_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-7.pt b/small_attn_out/A0_N300_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..dbe81f3bbeacbe7a39cb07e7e5d85944693a9da0 --- /dev/null +++ b/small_attn_out/A0_N300_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f20e3fdc82ac8edc83be6cad1899d93c11ae343006694b40148d1720e99e84 +size 1849456 diff --git a/small_attn_out/A0_N300_S-7_config.json b/small_attn_out/A0_N300_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f24e19d0d69a47d10a10d4a1c7fa0e9ccfea04eb --- /dev/null +++ b/small_attn_out/A0_N300_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-8.pt b/small_attn_out/A0_N300_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..213c89eef40bbe8c0c48fec81c5df0a159c88cda --- /dev/null +++ b/small_attn_out/A0_N300_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa647d1a405e429b0bb6b7b1be62bd5d776b81f886d767d863f86c2ab2cde947 +size 1849456 diff --git a/small_attn_out/A0_N300_S-8_config.json b/small_attn_out/A0_N300_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2ec9f6ccdc3cb19a625b61c87f0b99572ebf77f7 --- /dev/null +++ b/small_attn_out/A0_N300_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S-9.pt b/small_attn_out/A0_N300_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e9158530a420e542e4a6fc175b2f34bbc801a56 --- /dev/null +++ b/small_attn_out/A0_N300_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3fba6e9acb66bc3fd73d161cbfccc019653e15a69e390b7e12c7bc76ba438e +size 1849456 diff --git a/small_attn_out/A0_N300_S-9_config.json b/small_attn_out/A0_N300_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..51aa8ac85748a7c0e1974924067e9edc8ae40b8c --- /dev/null +++ b/small_attn_out/A0_N300_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S0.pt b/small_attn_out/A0_N300_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dfeaab4b93ea398cfefd5117a005ac7cf72eb20 --- /dev/null +++ b/small_attn_out/A0_N300_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab2e792aa009c941252a30ecbc7055e57f60c655df96732dbbfe201a1fa04252 +size 1849448 diff --git a/small_attn_out/A0_N300_S0_config.json b/small_attn_out/A0_N300_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0ef1cdf5b896822452ee9190005cda66ea86733 --- /dev/null +++ b/small_attn_out/A0_N300_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S0" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S1.pt b/small_attn_out/A0_N300_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..a11682f2bd5761f6e881da1b0e459f21c2eb2bf7 --- /dev/null +++ b/small_attn_out/A0_N300_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64caadf4c6f75349aaa9284817069238d2c9974a337e99c221c394eab3b5b0e8 +size 1849448 diff --git a/small_attn_out/A0_N300_S1_config.json b/small_attn_out/A0_N300_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cefd135d26b4be8c28a3b21669dd26021f763fb5 --- /dev/null +++ b/small_attn_out/A0_N300_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N300_S2.pt b/small_attn_out/A0_N300_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..a633f42a5d73a46bb3d2a007966fc34d981e18f9 --- /dev/null +++ b/small_attn_out/A0_N300_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17c0a4c6b686e81c76a00b7aaf6223c828796881ddf2d39a8fafe2b876a8a4f +size 1849448 diff --git a/small_attn_out/A0_N300_S2_config.json b/small_attn_out/A0_N300_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..14136e3d4310d6aa89871f8f937189f37d6eeee2 --- /dev/null +++ b/small_attn_out/A0_N300_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N300_S2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-1.pt b/small_attn_out/A1_N100_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..9119eb26dbecf4b3136b76e9f3197a88e93a2a65 --- /dev/null +++ b/small_attn_out/A1_N100_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b404d5bacd40523b6aba5ac71767f195ff508bcdd9bb2bd7f09374d3951471 +size 619888 diff --git a/small_attn_out/A1_N100_S-10.pt b/small_attn_out/A1_N100_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccc52c4a7845e6cc88b485f5285e1083689f6394 --- /dev/null +++ b/small_attn_out/A1_N100_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6ab9198f7e125747b31b6d51b7e9bbef223f57d660afaade47a2188ca0167e5 +size 619896 diff --git a/small_attn_out/A1_N100_S-10_config.json b/small_attn_out/A1_N100_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..712f9c17a0345e20d2880c297bb4a3837a70f70e --- /dev/null +++ b/small_attn_out/A1_N100_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-1_config.json b/small_attn_out/A1_N100_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4435b1e3b0086cc422a17a6b265e80d3ed2c50bb --- /dev/null +++ b/small_attn_out/A1_N100_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-2.pt b/small_attn_out/A1_N100_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b46965ebf27db10a5f9ec69232b9a374445e4e8 --- /dev/null +++ b/small_attn_out/A1_N100_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a19d0f365865548a361d9cd931d87b4908a337b11ed7d60c691ae2f95b199e +size 619888 diff --git a/small_attn_out/A1_N100_S-2_config.json b/small_attn_out/A1_N100_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ea6888e62011b408b7316e4f0969ef88bf7d9ad4 --- /dev/null +++ b/small_attn_out/A1_N100_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-3.pt b/small_attn_out/A1_N100_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9a67d8d28f75e6d441530f1f2412d0fc63a8857 --- /dev/null +++ b/small_attn_out/A1_N100_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f265642cf8de9efc2bf4baaf5d1ffbe11fb2b77bc88942742b4a882adf060397 +size 619888 diff --git a/small_attn_out/A1_N100_S-3_config.json b/small_attn_out/A1_N100_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6aafe0a72deb42d9c7428801eddc27d93139d4b --- /dev/null +++ b/small_attn_out/A1_N100_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-4.pt b/small_attn_out/A1_N100_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8778ac6b3eccf908b8f5e6d64a6bb77bb1a4e6d --- /dev/null +++ b/small_attn_out/A1_N100_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fb3eaf6257a93673dd3766606140039d082bc77dd7cdff3acfe2445d0d156f0 +size 619888 diff --git a/small_attn_out/A1_N100_S-4_config.json b/small_attn_out/A1_N100_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a915afb32065d857cd693a9ab2d5a3768f68a9ab --- /dev/null +++ b/small_attn_out/A1_N100_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-5.pt b/small_attn_out/A1_N100_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..c454ed0957fd8743214c086dc433dc665e35c998 --- /dev/null +++ b/small_attn_out/A1_N100_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:118e3a4e7ed88a8917c577e88de60b30de3e08a48f115685819f6cd0833f9a0f +size 619888 diff --git a/small_attn_out/A1_N100_S-5_config.json b/small_attn_out/A1_N100_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..978078c2ca45615f315b917e99cb83ed134c61c2 --- /dev/null +++ b/small_attn_out/A1_N100_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-6.pt b/small_attn_out/A1_N100_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fb6fa32fdd76270cc39f27807baa88dc0793735 --- /dev/null +++ b/small_attn_out/A1_N100_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc4d31a4be73da0461fbb7532ad2f7382365d9c4198649e09d904b4184ce191 +size 619888 diff --git a/small_attn_out/A1_N100_S-6_config.json b/small_attn_out/A1_N100_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ece964208d771ad14ad74d79fce3cd76b39e656 --- /dev/null +++ b/small_attn_out/A1_N100_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-7.pt b/small_attn_out/A1_N100_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dde3a9389f191e0c6ea22bff73f170c8f5dfac5 --- /dev/null +++ b/small_attn_out/A1_N100_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e48f202509892f959d314ac3209109c2eabb367550601dbdc014b4b2a2cdd4 +size 619888 diff --git a/small_attn_out/A1_N100_S-7_config.json b/small_attn_out/A1_N100_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..75c9385dd45ef38ec7fd7dcb96fcf58e2c5e9d55 --- /dev/null +++ b/small_attn_out/A1_N100_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-8.pt b/small_attn_out/A1_N100_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7afd5fafd9d58157254d32320e47af2226dd454 --- /dev/null +++ b/small_attn_out/A1_N100_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f97522693afd9e104b03aec3ed10bfae40f2a5af60d8dbac5bc2e8cdb00cfe7 +size 619888 diff --git a/small_attn_out/A1_N100_S-8_config.json b/small_attn_out/A1_N100_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..64bf5b7a0e5fb763855aad1bd2a76badd30a9027 --- /dev/null +++ b/small_attn_out/A1_N100_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S-9.pt b/small_attn_out/A1_N100_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7b8cbad2aeef4e3a2de9d135591156d539ae41b --- /dev/null +++ b/small_attn_out/A1_N100_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f3da916c866f15df44d921c0760d5a4c7971f1b2a3c524a2536563be2b0fb0 +size 619888 diff --git a/small_attn_out/A1_N100_S-9_config.json b/small_attn_out/A1_N100_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..241251e4973711dda1f499bf2696ec5b6060443a --- /dev/null +++ b/small_attn_out/A1_N100_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S0.pt b/small_attn_out/A1_N100_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..71b49e94d3080c090786a63042c3f7496f185620 --- /dev/null +++ b/small_attn_out/A1_N100_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1404450db83e3382d46ddab670e4ea17af61b2d36f1ef56efc8b2c491fc268f +size 619880 diff --git a/small_attn_out/A1_N100_S0_config.json b/small_attn_out/A1_N100_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6a22f5fb737536226077a8f585dde5e64e4bae1 --- /dev/null +++ b/small_attn_out/A1_N100_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S0" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S1.pt b/small_attn_out/A1_N100_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dd8ed2a1ae2cf6c536bf36184064011544197ca --- /dev/null +++ b/small_attn_out/A1_N100_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9cb648aa607d2b2e1a599b2fc5e7ea6786cc5346fdb683f1e2e033cc566727c +size 619880 diff --git a/small_attn_out/A1_N100_S1_config.json b/small_attn_out/A1_N100_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..715902773004bcdbc3fe99c8e7dee1cfee15cf8e --- /dev/null +++ b/small_attn_out/A1_N100_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N100_S2.pt b/small_attn_out/A1_N100_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..b534189320d49aa258e776542c9376a755d71163 --- /dev/null +++ b/small_attn_out/A1_N100_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d4bc6b45c87cbbf03fc7d0814d590b357e228152922083fb58914c07dc72ce5 +size 619880 diff --git a/small_attn_out/A1_N100_S2_config.json b/small_attn_out/A1_N100_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6d7d396422a6e74fa1be7711fd8fbdc23efb6be7 --- /dev/null +++ b/small_attn_out/A1_N100_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 100, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N100_S2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-1.pt b/small_attn_out/A1_N300_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..06a934804a06db1e4de3e7d8ba41e4e3c1ca7edc --- /dev/null +++ b/small_attn_out/A1_N300_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c379e7c793d546b8e1097e28b32ec1853e4483db60755c1f313d0b0bf41ffe79 +size 1849456 diff --git a/small_attn_out/A1_N300_S-10.pt b/small_attn_out/A1_N300_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd79f37436328001d2786db6a4f8af41c731409 --- /dev/null +++ b/small_attn_out/A1_N300_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30a81c1f3aa544704ea0bb949351f30ae62e0faa895a5244406fcc7df910e12 +size 1849464 diff --git a/small_attn_out/A1_N300_S-10_config.json b/small_attn_out/A1_N300_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9525674f7245e64a5e72266c8ab1c3c77e652d01 --- /dev/null +++ b/small_attn_out/A1_N300_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-1_config.json b/small_attn_out/A1_N300_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4c7c5bb25e1b844809e88e459f37715edde09af5 --- /dev/null +++ b/small_attn_out/A1_N300_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-2.pt b/small_attn_out/A1_N300_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecd7ce8a8d16b9cfe4ac46880368cb315799ae7c --- /dev/null +++ b/small_attn_out/A1_N300_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c363e8a2775275dfcb75ed70a71b18089ca2d7e62663bb471deea99e1c6ac5 +size 1849456 diff --git a/small_attn_out/A1_N300_S-2_config.json b/small_attn_out/A1_N300_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6eead25508ba4f692c7a42641dd67471c53e3af8 --- /dev/null +++ b/small_attn_out/A1_N300_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-3.pt b/small_attn_out/A1_N300_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c8d8c00a5cd984bcc57bd3e937a8e1731f77fae --- /dev/null +++ b/small_attn_out/A1_N300_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e055efe0d7231505e8bd605d3a7b92b87889cbfab108c731a03ccea4d5f5186 +size 1849456 diff --git a/small_attn_out/A1_N300_S-3_config.json b/small_attn_out/A1_N300_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d5abd46f9565b9d73fcb3c794a6ebf96e630f401 --- /dev/null +++ b/small_attn_out/A1_N300_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-4.pt b/small_attn_out/A1_N300_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab13f026dab46811364ece0834d23a633cd78561 --- /dev/null +++ b/small_attn_out/A1_N300_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2565f3c577a581e29ff235a5002400b90c846941295638a155f0275f9857d2ad +size 1849456 diff --git a/small_attn_out/A1_N300_S-4_config.json b/small_attn_out/A1_N300_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3fdaf6643ecca5d6ac7c5b446ba20afe7f1c6718 --- /dev/null +++ b/small_attn_out/A1_N300_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-5.pt b/small_attn_out/A1_N300_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..6adbb32cb31eda23d7e31abd88adbb53320c4c6c --- /dev/null +++ b/small_attn_out/A1_N300_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9950838983d2043b3d9c339c1f217bfd33d3c73fb60280831ac9b5c51922bcf +size 1849456 diff --git a/small_attn_out/A1_N300_S-5_config.json b/small_attn_out/A1_N300_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..44459ccd9e42459710a16aa064556584e8bd04a7 --- /dev/null +++ b/small_attn_out/A1_N300_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-6.pt b/small_attn_out/A1_N300_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ba760c98c0d0b1243cb98fb8113c6af085c7219 --- /dev/null +++ b/small_attn_out/A1_N300_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f424002230e50eb7e61824bf15992d7e6e04c071e7801ce81b77bfac10d778f +size 1849456 diff --git a/small_attn_out/A1_N300_S-6_config.json b/small_attn_out/A1_N300_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1fa1edff38eb48964ea202bf2bff759130773cc2 --- /dev/null +++ b/small_attn_out/A1_N300_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-7.pt b/small_attn_out/A1_N300_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0dd07b953917b395fefcff63455a28186f4832d --- /dev/null +++ b/small_attn_out/A1_N300_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d710f0b6c1683bba7712dc8fcfcc4f2b7f4da2b7f92dfabda9515ca3f393b0c1 +size 1849456 diff --git a/small_attn_out/A1_N300_S-7_config.json b/small_attn_out/A1_N300_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2efcf61689dd118a7c775fda49c0aa9089cceb68 --- /dev/null +++ b/small_attn_out/A1_N300_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-8.pt b/small_attn_out/A1_N300_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..75f8f14ec06bbd03f44c02631876e2d8481a8be7 --- /dev/null +++ b/small_attn_out/A1_N300_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77064142854070d2aba212155395f1d735244e3fc363b52cbf7695eb3e504c2c +size 1849456 diff --git a/small_attn_out/A1_N300_S-8_config.json b/small_attn_out/A1_N300_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ce9230577a93d9d55c07d296cb9eccd1dfa507 --- /dev/null +++ b/small_attn_out/A1_N300_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S-9.pt b/small_attn_out/A1_N300_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..08750a7f5175896cdbaf2e0fd069e0a01e7ab91d --- /dev/null +++ b/small_attn_out/A1_N300_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7099d8d15e84187f833c74c07442e90362632e7bd5a9be1412426a8696fca4f1 +size 1849456 diff --git a/small_attn_out/A1_N300_S-9_config.json b/small_attn_out/A1_N300_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a1fb7b213e661705a7da9f0496e5f0c9c501d877 --- /dev/null +++ b/small_attn_out/A1_N300_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S0.pt b/small_attn_out/A1_N300_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..15c8220a447098071bf59fa7c13a0aced53f2f1d --- /dev/null +++ b/small_attn_out/A1_N300_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded1c88f9603c0fce7d4459cfeaed20df241bcd54ff55574617d5eba4c36629e +size 1849448 diff --git a/small_attn_out/A1_N300_S0_config.json b/small_attn_out/A1_N300_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9f0d57146378b735da855643db7b8f13cb457e73 --- /dev/null +++ b/small_attn_out/A1_N300_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S0" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S1.pt b/small_attn_out/A1_N300_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..1417917e12c3155e75b80c52a10183d27d02b2e7 --- /dev/null +++ b/small_attn_out/A1_N300_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8258a24ac7384af4e1ea0b23db53e4b0bf36935dc91fdacc959a282b7055c9c1 +size 1849448 diff --git a/small_attn_out/A1_N300_S1_config.json b/small_attn_out/A1_N300_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5c3ba21cd38b519fe45911c4786cb29088e201d2 --- /dev/null +++ b/small_attn_out/A1_N300_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N300_S2.pt b/small_attn_out/A1_N300_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4d313bde5cd22da90a69308c67e0c8546f1c485 --- /dev/null +++ b/small_attn_out/A1_N300_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f1d4a7b2b6fa6fada21e54d0eeac15778f337ba543a74abd91bfcc27b9c182f +size 1849448 diff --git a/small_attn_out/A1_N300_S2_config.json b/small_attn_out/A1_N300_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5df230326e915b929089b529cda08b62ec87e382 --- /dev/null +++ b/small_attn_out/A1_N300_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 300, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N300_S2" +} \ No newline at end of file