Upload GPTSANJapaneseForConditionalGeneration

Browse files

Files changed (5) hide show

config.json +2 -6
generation_config.json +4 -2
pytorch_model-00001-of-00002.bin +3 -0
pytorch_model-00002-of-00002.bin +3 -0
pytorch_model.bin.index.json +10 -10

config.json CHANGED Viewed

@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "Tanrei/GPTSAN-japanese",
   "architectures": [
     "GPTSANJapaneseForConditionalGeneration"
   ],
@@ -8,15 +7,14 @@
   "d_ff": 8192,
   "d_model": 1024,
   "d_spout": 128,
-  "do_sample": true,
   "dropout_rate": 0.0,
   "eos_token_id": 35999,
   "expert_capacity": 128,
   "initializer_factor": 0.002,
   "layer_norm_epsilon": 1e-05,
   "mask_token_id": 35994,
   "model_type": "gptsan-japanese",
-  "num_contexts": 1280,
   "num_experts": 16,
   "num_ext_layers": 0,
   "num_heads": 16,
@@ -29,10 +27,8 @@
   "router_ignore_padding_tokens": false,
   "router_jitter_noise": 0.0,
   "separator_token_id": 35998,
-  "top_k": 120,
   "torch_dtype": "float32",
-  "transformers_version": "4.26.0.dev0",
   "unk_token_id": 35996,
-  "use_cache": true,
   "vocab_size": 36000
 }

 {
   "architectures": [
     "GPTSANJapaneseForConditionalGeneration"
   ],
   "d_ff": 8192,
   "d_model": 1024,
   "d_spout": 128,
   "dropout_rate": 0.0,
   "eos_token_id": 35999,
   "expert_capacity": 128,
   "initializer_factor": 0.002,
   "layer_norm_epsilon": 1e-05,
   "mask_token_id": 35994,
+  "max_position_embeddings": 1280,
   "model_type": "gptsan-japanese",
   "num_experts": 16,
   "num_ext_layers": 0,
   "num_heads": 16,
   "router_ignore_padding_tokens": false,
   "router_jitter_noise": 0.0,
   "separator_token_id": 35998,
   "torch_dtype": "float32",
+  "transformers_version": "4.27.0.dev0",
   "unk_token_id": 35996,
   "vocab_size": 36000
 }

generation_config.json CHANGED Viewed

@@ -1,5 +1,7 @@
 {
-  "do_sample": true,
-  "top_k": 120,
   "transformers_version": "4.27.0.dev0"
 }

 {
+  "_from_model_config": true,
+  "bos_token_id": 35993,
+  "eos_token_id": 35999,
+  "pad_token_id": 35995,
   "transformers_version": "4.27.0.dev0"
 }

pytorch_model-00001-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f34e100f478d000708270536fd52b3011f09343c6d4b007b94e662abf8531621
+size 9972570554

pytorch_model-00002-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1f9dd95d7ccb1c5972ba3a267ca07113bf41ac5c6adc3973a758f0b28e4d4dc
+size 1143580233

pytorch_model.bin.index.json CHANGED Viewed

@@ -38,7 +38,7 @@
     "blocks.0.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.0.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.0.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.0.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -78,7 +78,7 @@
     "blocks.1.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.1.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.1.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.1.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -118,7 +118,7 @@
     "blocks.2.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.2.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.2.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.2.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -158,7 +158,7 @@
     "blocks.3.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.3.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.3.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.3.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -198,7 +198,7 @@
     "blocks.4.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.4.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.4.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.4.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -238,7 +238,7 @@
     "blocks.5.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.5.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.5.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.5.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -278,7 +278,7 @@
     "blocks.6.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.6.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.6.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.6.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -318,7 +318,7 @@
     "blocks.7.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.7.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.7.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-    "blocks.7.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -358,7 +358,7 @@
     "blocks.8.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.8.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
     "blocks.8.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
-    "blocks.8.FeedForward.smlp.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.8.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.8.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.8.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -398,7 +398,7 @@
     "blocks.9.FeedForward.mlp.router.classifier.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.9.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
     "blocks.9.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
-    "blocks.9.FeedForward.smlp.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.SelfAttention.o": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.SelfAttention.qkv": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.norm.bias": "pytorch_model-00002-of-00002.bin",

     "blocks.0.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.0.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.0.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.0.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.0.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.1.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.1.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.1.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.1.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.1.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.2.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.2.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.2.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.2.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.2.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.3.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.3.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.3.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.3.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.3.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.4.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.4.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.4.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.4.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.4.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.5.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.5.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.5.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.5.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.5.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.6.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.6.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.6.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.6.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.6.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.7.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.7.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.7.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
+    "blocks.7.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.7.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.8.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
     "blocks.8.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
     "blocks.8.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
+    "blocks.8.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.8.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
     "blocks.8.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
     "blocks.8.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
     "blocks.9.FeedForward.mlp.router.classifier.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.9.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
     "blocks.9.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
+    "blocks.9.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.SelfAttention.o": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.SelfAttention.qkv": "pytorch_model-00002-of-00002.bin",
     "blocks.9.SelfAttention.norm.bias": "pytorch_model-00002-of-00002.bin",