Upload GPTSANJapaneseForConditionalGeneration
Browse files
- config.json +2 -6
- generation_config.json +4 -2
- pytorch_model-00001-of-00002.bin +3 -0
- pytorch_model-00002-of-00002.bin +3 -0
- pytorch_model.bin.index.json +10 -10
config.json
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "Tanrei/GPTSAN-japanese",
|
3 |
"architectures": [
|
4 |
"GPTSANJapaneseForConditionalGeneration"
|
5 |
],
|
@@ -8,15 +7,14 @@
|
|
8 |
"d_ff": 8192,
|
9 |
"d_model": 1024,
|
10 |
"d_spout": 128,
|
11 |
-
"do_sample": true,
|
12 |
"dropout_rate": 0.0,
|
13 |
"eos_token_id": 35999,
|
14 |
"expert_capacity": 128,
|
15 |
"initializer_factor": 0.002,
|
16 |
"layer_norm_epsilon": 1e-05,
|
17 |
"mask_token_id": 35994,
|
|
|
18 |
"model_type": "gptsan-japanese",
|
19 |
-
"num_contexts": 1280,
|
20 |
"num_experts": 16,
|
21 |
"num_ext_layers": 0,
|
22 |
"num_heads": 16,
|
@@ -29,10 +27,8 @@
|
|
29 |
"router_ignore_padding_tokens": false,
|
30 |
"router_jitter_noise": 0.0,
|
31 |
"separator_token_id": 35998,
|
32 |
-
"top_k": 120,
|
33 |
"torch_dtype": "float32",
|
34 |
-
"transformers_version": "4.
|
35 |
"unk_token_id": 35996,
|
36 |
-
"use_cache": true,
|
37 |
"vocab_size": 36000
|
38 |
}
|
|
|
1 |
{
|
|
|
2 |
"architectures": [
|
3 |
"GPTSANJapaneseForConditionalGeneration"
|
4 |
],
|
|
|
7 |
"d_ff": 8192,
|
8 |
"d_model": 1024,
|
9 |
"d_spout": 128,
|
|
|
10 |
"dropout_rate": 0.0,
|
11 |
"eos_token_id": 35999,
|
12 |
"expert_capacity": 128,
|
13 |
"initializer_factor": 0.002,
|
14 |
"layer_norm_epsilon": 1e-05,
|
15 |
"mask_token_id": 35994,
|
16 |
+
"max_position_embeddings": 1280,
|
17 |
"model_type": "gptsan-japanese",
|
|
|
18 |
"num_experts": 16,
|
19 |
"num_ext_layers": 0,
|
20 |
"num_heads": 16,
|
|
|
27 |
"router_ignore_padding_tokens": false,
|
28 |
"router_jitter_noise": 0.0,
|
29 |
"separator_token_id": 35998,
|
|
|
30 |
"torch_dtype": "float32",
|
31 |
+
"transformers_version": "4.27.0.dev0",
|
32 |
"unk_token_id": 35996,
|
|
|
33 |
"vocab_size": 36000
|
34 |
}
|
generation_config.json
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
-
"
|
|
|
|
|
4 |
"transformers_version": "4.27.0.dev0"
|
5 |
}
|
|
|
1 |
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 35993,
|
4 |
+
"eos_token_id": 35999,
|
5 |
+
"pad_token_id": 35995,
|
6 |
"transformers_version": "4.27.0.dev0"
|
7 |
}
|
pytorch_model-00001-of-00002.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f34e100f478d000708270536fd52b3011f09343c6d4b007b94e662abf8531621
|
3 |
+
size 9972570554
|
pytorch_model-00002-of-00002.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1f9dd95d7ccb1c5972ba3a267ca07113bf41ac5c6adc3973a758f0b28e4d4dc
|
3 |
+
size 1143580233
|
pytorch_model.bin.index.json
CHANGED
@@ -38,7 +38,7 @@
|
|
38 |
"blocks.0.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
39 |
"blocks.0.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
40 |
"blocks.0.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
41 |
-
"blocks.0.FeedForward.
|
42 |
"blocks.0.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
43 |
"blocks.0.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
44 |
"blocks.0.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -78,7 +78,7 @@
|
|
78 |
"blocks.1.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
79 |
"blocks.1.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
80 |
"blocks.1.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
81 |
-
"blocks.1.FeedForward.
|
82 |
"blocks.1.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
83 |
"blocks.1.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
84 |
"blocks.1.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -118,7 +118,7 @@
|
|
118 |
"blocks.2.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
119 |
"blocks.2.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
120 |
"blocks.2.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
121 |
-
"blocks.2.FeedForward.
|
122 |
"blocks.2.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
123 |
"blocks.2.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
124 |
"blocks.2.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -158,7 +158,7 @@
|
|
158 |
"blocks.3.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
159 |
"blocks.3.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
160 |
"blocks.3.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
161 |
-
"blocks.3.FeedForward.
|
162 |
"blocks.3.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
163 |
"blocks.3.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
164 |
"blocks.3.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -198,7 +198,7 @@
|
|
198 |
"blocks.4.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
199 |
"blocks.4.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
200 |
"blocks.4.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
201 |
-
"blocks.4.FeedForward.
|
202 |
"blocks.4.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
203 |
"blocks.4.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
204 |
"blocks.4.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -238,7 +238,7 @@
|
|
238 |
"blocks.5.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
239 |
"blocks.5.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
240 |
"blocks.5.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
241 |
-
"blocks.5.FeedForward.
|
242 |
"blocks.5.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
243 |
"blocks.5.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
244 |
"blocks.5.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -278,7 +278,7 @@
|
|
278 |
"blocks.6.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
279 |
"blocks.6.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
280 |
"blocks.6.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
281 |
-
"blocks.6.FeedForward.
|
282 |
"blocks.6.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
283 |
"blocks.6.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
284 |
"blocks.6.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -318,7 +318,7 @@
|
|
318 |
"blocks.7.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
319 |
"blocks.7.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
320 |
"blocks.7.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
321 |
-
"blocks.7.FeedForward.
|
322 |
"blocks.7.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
323 |
"blocks.7.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
324 |
"blocks.7.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -358,7 +358,7 @@
|
|
358 |
"blocks.8.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
359 |
"blocks.8.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
|
360 |
"blocks.8.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
|
361 |
-
"blocks.8.FeedForward.
|
362 |
"blocks.8.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
363 |
"blocks.8.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
364 |
"blocks.8.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
@@ -398,7 +398,7 @@
|
|
398 |
"blocks.9.FeedForward.mlp.router.classifier.weight": "pytorch_model-00002-of-00002.bin",
|
399 |
"blocks.9.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
|
400 |
"blocks.9.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
|
401 |
-
"blocks.9.FeedForward.
|
402 |
"blocks.9.SelfAttention.SelfAttention.o": "pytorch_model-00002-of-00002.bin",
|
403 |
"blocks.9.SelfAttention.SelfAttention.qkv": "pytorch_model-00002-of-00002.bin",
|
404 |
"blocks.9.SelfAttention.norm.bias": "pytorch_model-00002-of-00002.bin",
|
|
|
38 |
"blocks.0.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
39 |
"blocks.0.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
40 |
"blocks.0.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
41 |
+
"blocks.0.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
42 |
"blocks.0.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
43 |
"blocks.0.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
44 |
"blocks.0.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
78 |
"blocks.1.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
79 |
"blocks.1.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
80 |
"blocks.1.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
81 |
+
"blocks.1.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
82 |
"blocks.1.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
83 |
"blocks.1.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
84 |
"blocks.1.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
118 |
"blocks.2.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
119 |
"blocks.2.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
120 |
"blocks.2.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
121 |
+
"blocks.2.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
122 |
"blocks.2.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
123 |
"blocks.2.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
124 |
"blocks.2.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
158 |
"blocks.3.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
159 |
"blocks.3.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
160 |
"blocks.3.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
161 |
+
"blocks.3.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
162 |
"blocks.3.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
163 |
"blocks.3.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
164 |
"blocks.3.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
198 |
"blocks.4.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
199 |
"blocks.4.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
200 |
"blocks.4.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
201 |
+
"blocks.4.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
202 |
"blocks.4.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
203 |
"blocks.4.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
204 |
"blocks.4.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
238 |
"blocks.5.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
239 |
"blocks.5.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
240 |
"blocks.5.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
241 |
+
"blocks.5.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
242 |
"blocks.5.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
243 |
"blocks.5.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
244 |
"blocks.5.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
278 |
"blocks.6.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
279 |
"blocks.6.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
280 |
"blocks.6.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
281 |
+
"blocks.6.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
282 |
"blocks.6.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
283 |
"blocks.6.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
284 |
"blocks.6.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
318 |
"blocks.7.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
319 |
"blocks.7.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
|
320 |
"blocks.7.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
|
321 |
+
"blocks.7.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
|
322 |
"blocks.7.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
323 |
"blocks.7.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
324 |
"blocks.7.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
358 |
"blocks.8.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
|
359 |
"blocks.8.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
|
360 |
"blocks.8.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
|
361 |
+
"blocks.8.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
|
362 |
"blocks.8.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
|
363 |
"blocks.8.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
|
364 |
"blocks.8.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
|
|
|
398 |
"blocks.9.FeedForward.mlp.router.classifier.weight": "pytorch_model-00002-of-00002.bin",
|
399 |
"blocks.9.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
|
400 |
"blocks.9.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
|
401 |
+
"blocks.9.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
|
402 |
"blocks.9.SelfAttention.SelfAttention.o": "pytorch_model-00002-of-00002.bin",
|
403 |
"blocks.9.SelfAttention.SelfAttention.qkv": "pytorch_model-00002-of-00002.bin",
|
404 |
"blocks.9.SelfAttention.norm.bias": "pytorch_model-00002-of-00002.bin",
|