add CerebrasGPT-111_muP

Files changed (3) hide show

CerebrasGPT-111_muP/config.json +26 -0
CerebrasGPT-111_muP/pytorch_model-00001-of-00001.bin +3 -0
CerebrasGPT-111_muP/pytorch_model.bin.index.json +132 -0

CerebrasGPT-111_muP/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+    "attn_pdrop": 0.0,
+    "scale_attn_weights": true,
+    "resid_pdrop": 0.0,
+    "n_inner": 3072,
+    "n_embd": 768,
+    "layer_norm_epsilon": 1e-05,
+    "n_positions": 2048,
+    "mup_embeddings_scale": 10,
+    "activation_function": "gelu",
+    "n_head": 12,
+    "n_layer": 10,
+    "position_embedding_type": "learned",
+    "mup_scale_qk_dot_by_d": true,
+    "tie_word_embeddings": true,
+    "vocab_size": 50257,
+    "alibi_scaling": null,
+    "embd_pdrop": 0.0,
+    "model_type": "btlm",
+    "use_cache": true,
+    "auto_map": {
+        "AutoConfig": "cerebras/btlm-3b-8k-base--configuration_btlm.BTLMConfig",
+        "AutoModel": "cerebras/btlm-3b-8k-base--modeling_btlm.BTLMModel",
+        "AutoModelForCausalLM": "cerebras/btlm-3b-8k-base--modeling_btlm.BTLMLMHeadModel"
+    }
+}

CerebrasGPT-111_muP/pytorch_model-00001-of-00001.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2358a4a5122966c3e777dcf24b3170ad4aa0eebae041382da41c8aa5635dcca5
+size 598632525

CerebrasGPT-111_muP/pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+    "metadata": {
+        "total_size": 598591488
+    },
+    "weight_map": {
+        "transformer.wte.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.wpe.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00001.bin",
+        "transformer.ln_f.weight": "pytorch_model-00001-of-00001.bin",
+        "transformer.ln_f.bias": "pytorch_model-00001-of-00001.bin",
+        "lm_head.weight": "pytorch_model-00001-of-00001.bin"
+    }
+}