unassigned
commited on
Commit
•
c833498
1
Parent(s):
41c65b6
add 2b token runs
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v15.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v15_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v23.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v23_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v31.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v31_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v39.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v39_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v47.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v47_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v55.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v55_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v63.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v63_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v7.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v71.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v71_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v79.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v79_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v7_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v14.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v14_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v22.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v22_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v30.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v30_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v38.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v38_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v46.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v46_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v54.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v54_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v6.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v62.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v62_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v6_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v70.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v70_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v78.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v78_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v11.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v11_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v19.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v19_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v27.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v27_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v3.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v35.pt +3 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v35_cfg.json +1 -0
- concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v3_cfg.json +1 -0
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v15.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e97c72914e51b45d477e4d3d081d1fdd144d02384f8e31b03fba2fb7d72a40b
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v15_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v23.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9c382c9450e29c3f14c6e9b640640103ec92e56839f6e17996638c9a57a190eb
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v23_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v31.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f465f20e848bb3715e9cc8624db956df5fef6c6fcc90b41ea5c5557ed93abb4e
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v31_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v39.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf89777c8e6616c4189c0f392b5c1393792ce582ebc0d8daee1c5a3b5bf12d33
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v39_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v47.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:032b1130fe9077742abe44f7903346057dc3af9a5954bf07cdf42e449a7f44ac
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v47_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v55.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d46a18cba74e90cae299762ed1431774a78f8e278427710a064ecf0dd6ab202
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v55_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v63.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3273fd7876079820290f4fdc3f5bb209dad7a518bb8ca24bfd398ae7576be599
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v63_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v7.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03ea8f38194fc7fc3c6fc34138aed35f307e4cbccaf7b168cea152d7848e4db0
|
3 |
+
size 67179616
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v71.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5733bad4d356667db55867a411d8b2d45d3fa996c632fb08b10fc7a4a997e482
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v71_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v79.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a3e0bf49f971047ce76fbc2c7c3e103873da31fc8b037eccfa9875e2577b71c3
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v79_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l11.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v7_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 1.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v14.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7afacacac684a397a856fe55389a4dcee08470100e679d5a2e0874ae1e481585
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v14_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v22.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6523e76ebd27aeb7931533f7a624b66093b7aa3154498b42472adddfcd9c68fc
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v22_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v30.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88d8b2fd55028a37e567736fe502fd23d6acfc4908c3a63345c712b3480f090a
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v30_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v38.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d5233bbfdfe40a9312d5d6133ad877eb0b0d183e47b2eb20c27bbfd8292afab4
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v38_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v46.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e47de6b3c846d938bfbf5bef41d8b9d97362e1b9ecb4bfad331c4dc66b31e87f
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v46_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v54.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f39ce5d0eb3c49ecdfc948062a28bf81cc88ca93aa09e110945d8c1b4a2ac04
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v54_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v6.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:846bfb4552ffd78868642f38a6909033a03c5528a7d33016808fa528c311c961
|
3 |
+
size 67179616
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v62.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91521b5decd333ab26ad70c593e4305d3bc74980a39669fb78257438787c4f79
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v62_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v6_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v70.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73bb85f80ff54fa2ee77d7a9ddf089f62bab1294e89bc092859cb259cef90004
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v70_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v78.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4e2604a5383ab7cfed42a0309a9f45fcf603e9a4953458551dbb1bbd1554150d
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l12.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v78_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 2.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v11.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4d74e31c0826fce61245aec45202d4a41411ac755c92247eeb51375b9baf793
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v11_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 4.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v19.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:197afa3bec7503155d55caeb9ceaef6a73d82290c07152c0366e582524c3f723
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v19_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 4.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v27.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e51babc82c8c8d3dd01ed0b659fd4ed4e92f07a09e9a00e0c57b508729a9680
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v27_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 4.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v3.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f806c03038bd695370b08bc1fa9e16256b32ea47681d92f603941a6a15fdf71
|
3 |
+
size 67179616
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v35.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:297df2b990f03aa3d4b7a12ca68d36d43fb68a8fbd4e9dd8920b1ef6c632771e
|
3 |
+
size 67179624
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v35_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 4.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|
concat-z-gelu-21-l1-lr-sweep-3/gelu-2l_L1_Hcat_z_lr1.00e-03_l14.00e+00_ds16384_bs4096_dc1.00e-07_rie50000_nr4_v3_cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"seed": 49, "batch_size": 4096, "buffer_mult": 384, "lr": 0.001, "num_tokens": 2000000000, "l1_coeff": 4.0, "beta1": 0.9, "beta2": 0.99, "dict_mult": 32, "seq_len": 128, "enc_dtype": "fp32", "model_name": "gelu-2l", "site": "z", "layer": 1, "device": "cuda", "reinit": "reinit", "head": "cat", "concat_heads": true, "dead_direction_cutoff": 1e-07, "re_init_every": 50000, "anthropic_resample_last": 25000, "resample_factor": 0.01, "num_resamples": 4, "wandb_project_name": "concat-z-gelu-21-l1-lr-sweep-3", "wandb_entity": "ckkissane", "save_state_dict_every": 50000, "model_batch_size": 512, "buffer_size": 1572864, "buffer_batches": 12288, "act_name": "blocks.1.attn.hook_z", "act_size": 512, "dict_size": 16384, "name": "gelu-2l_1_16384_z"}
|