ZeroMatrix committed
Commit bf8969c
1 Parent(s): c1daed0

opi_full_tuned_model
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "/share/project/xiaohongwang/LLM_checkpoints/galai/galactica-6.7b",
+ "_remove_final_layer_norm": false,
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "architectures": [
+ "OPTForCausalLM"
+ ],
+ "attention_dropout": 0.1,
+ "bos_token_id": 0,
+ "do_layer_norm_before": true,
+ "dropout": 0.1,
+ "enable_bias": true,
+ "eos_token_id": 2,
+ "ffn_dim": 16384,
+ "hidden_size": 4096,
+ "init_std": 0.02,
+ "layer_norm_elementwise_affine": true,
+ "layerdrop": 0.0,
+ "learned_embeddings": true,
+ "max_position_embeddings": 2048,
+ "model_type": "opt",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "pad_token_id": 1,
+ "scale_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.28.1",
+ "use_cache": true,
+ "vocab_size": 50001,
+ "word_embed_proj_dim": 4096
+ }
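The config above pins the architecture of the fine-tuned checkpoint: an OPT-style decoder (Galactica-6.7B base) with 32 layers, 32 attention heads over a 4096-dim hidden state, a 16384-dim FFN, 2048 positions, and a 50001-token vocabulary stored in bfloat16. A minimal sketch of reading those fields back with the transformers library (the local path is a placeholder for a clone of this repository, not part of the commit):

from transformers import OPTConfig

config = OPTConfig.from_pretrained("./opi_full_tuned_model")  # placeholder path to a local clone
assert config.model_type == "opt" and config.architectures == ["OPTForCausalLM"]
print(config.num_hidden_layers, config.num_attention_heads)   # 32 decoder layers, 32 heads
print(config.hidden_size // config.num_attention_heads)       # 4096 / 32 = 128-dim attention heads
print(config.max_position_embeddings, config.vocab_size)      # 2048, 50001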
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "pad_token_id": 1,
+ "transformers_version": "4.28.1"
+ }
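generation_config.json only carries the special-token ids (bos=0, eos=2, pad=1), so everything else falls back to library defaults at generation time. A hedged end-to-end sketch of loading the checkpoint and generating from it; the repository id and prompt below are placeholders, not part of this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "ZeroMatrix/opi_full_tuned_model"  # placeholder; substitute the actual repo id or a local path
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16)
model.eval()

inputs = tokenizer("Describe the function of the following protein:", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)  # eos/pad ids are taken from generation_config.json
print(tokenizer.decode(out[0], skip_special_tokens=True))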
pytorch_model-1-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33851cf19ec9838c81ab69cea078b938134a369c835b7254044adf13392f0bd2
+ size 1658361449
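Each pytorch_model-*-of-32.bin entry in this commit is a Git LFS pointer (spec version, SHA-256 oid, byte size), not the tensor data itself; the real shard is fetched by `git lfs pull` or the Hub download endpoints. A small sketch for checking a downloaded shard against the oid recorded above:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in 1 MiB chunks so multi-GB shards never need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Expected: 33851cf19ec9838c81ab69cea078b938134a369c835b7254044adf13392f0bd2 (1658361449 bytes)
print(sha256_of("pytorch_model-1-of-32.bin"))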
pytorch_model-10-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21f717222b7cda430c1314ab03a4faaff71024053bea284c20fb50bf3e31fead
+ size 805523957
pytorch_model-11-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9528e113dc74ed0a25d4ca1c74f9ac85ec93e3ef50f9aacc205504482ddd64f6
+ size 805523957
pytorch_model-12-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:730cee9d38d7df6fe2f6a56e26daf5ca81d573faa41e5046280e1ff7c5169fba
+ size 805523957
pytorch_model-13-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53f0e9cc2fb124a4bc186c6f49d2ad27245423ecd10e3e7a566e78fb1d3cd377
+ size 805523957
pytorch_model-14-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bc4add41baef7f74caa4f07fd3ace481ad8ba706914e114983c3eb9e6845a12
+ size 805523957
pytorch_model-15-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70d8328d3a735b1cc484161814e969e97a5e8ff786512cf03b04962667137708
+ size 805523957
pytorch_model-16-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e81cdcfb2bff10d49eb00a449b652578686cbb36f01eca74eb49c537acbb627
+ size 805523957
pytorch_model-17-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18aa842f6f7cb8a491a1c9c46136fdfd5aced6103cbfe42ad22b1e5bf32e3ba3
+ size 805523957
pytorch_model-18-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:257458025f14cc240435ceb4eaec213ffa3ea68c0bcd5147d9df4aff7aca1d97
+ size 805523957
pytorch_model-19-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06f2c30b6d7e69811409ee73fcfb65938843b68f7943ee9178dcf7758928b58f
+ size 805523957
pytorch_model-2-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5659e53f7b0c9455892d4d86d55631848d72909da303938f51eccbc932c87e4
+ size 805523957
pytorch_model-20-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e470113e2fc15266b34d1543bd71fcc73bf4dcc8ba7fb76b284b8114cba0de4b
+ size 805523957
pytorch_model-21-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a07c8fb9b9a2477215fbfecade440cbbb09d7f2ee11cf837d6730233e7ab35d9
+ size 805523957
pytorch_model-22-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:086c97e31fce8a97268a04f3cc9ab3d58ee954a6fdd53e16d6df507bfd9a833c
+ size 805523957
pytorch_model-23-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a83cd31c4c13046c83657e05df2f5209d8d4220060ac2e21ded5dd651f5b7e0
+ size 805523957
pytorch_model-24-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8520e04e8e46e893928be3787fd723e7e954ebf76a3901e07168c1684186e458
+ size 805523957
pytorch_model-25-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b4a027bf43608f1db0ab7865db1a6da4ec7791ff763d2898693d540b1d5034d
+ size 805523957
pytorch_model-26-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:958485a57eb8da5f7ab6fc5088ac082fcaec51ce04231008768fd178f740fd8a
+ size 805523957
pytorch_model-27-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3b8c90bcc069a0b4a9443026c291c8d39ac455cae370c8fcbfdafdb025281f0
+ size 805523957
pytorch_model-28-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e81e1d3234156366bb8a3281283df8c2dc1545c2d7df48148104e692eeddaf23
+ size 805523957
pytorch_model-29-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83c9303e57b75c11e6d294d29b9b88f0a050f91dcaddceb6aedde39083bec47b
+ size 805523957
pytorch_model-3-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64e800582d0d01e32a9cc7b38aa393f992f693d1f92af86c486c0d6fde118d75
+ size 805523957
pytorch_model-30-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c94ac13a208adbecd84f03060a90fa37f6d8da99575e61f1b311f1681bf31f62
+ size 805523957
pytorch_model-31-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f21492733523958e62883b9d591ba94c190b2bfcd60319e60e066598153040a1
+ size 805523957
pytorch_model-32-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d0ede5fbb5b7d971d1d3bb02eaf5cbc998c8e2d0a16c0657c3366f493d207fc
+ size 1624740658
pytorch_model-4-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:270822c412e74256763a7406b88f9c51b66759e062fcfeb44703d39d30783c71
+ size 805523957
pytorch_model-5-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e05e99ab17b48f57d563368043850c25de9c2f07b5bae195b2ceb8bbd113a99
+ size 805523957
pytorch_model-6-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64007b17970cd393fddbb50e580dd5a531f17ef79b647857d77cce6142a8f063
+ size 805523957
pytorch_model-7-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b96ca35655a151dd4003e286b413dddc4212be8b676bd2f74d0cbfe61812fde
+ size 805523957
pytorch_model-8-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4b562f760b7441d90dd9b5e699ac5153d6ceaf0773d7f6d937bceb022a2895
+ size 805523957
pytorch_model-9-of-32.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4b1189685df26dc60d1076441d96169348ca8a8084dc12770ef1b8fe38eb168
+ size 805523957
pytorch_model.bin.index.json ADDED
@@ -0,0 +1 @@
+ {"metadata": {"total_size": 13724336128}, "weight_map": {"model.decoder.layers.0.self_attn.q_proj.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.q_proj.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.k_proj.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.k_proj.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.v_proj.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.v_proj.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.out_proj.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn.out_proj.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn_layer_norm.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.self_attn_layer_norm.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.fc1.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.fc1.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.fc2.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.fc2.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.final_layer_norm.weight": "pytorch_model-1-of-32.bin", "model.decoder.layers.0.final_layer_norm.bias": "pytorch_model-1-of-32.bin", "model.decoder.embed_tokens.weight": "pytorch_model-1-of-32.bin", "model.decoder.embed_positions.weight": "pytorch_model-1-of-32.bin", "model.decoder.final_layer_norm.weight": "pytorch_model-1-of-32.bin", "model.decoder.final_layer_norm.bias": "pytorch_model-1-of-32.bin", "model.decoder.layers.1.self_attn.q_proj.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.q_proj.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.k_proj.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.k_proj.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.v_proj.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.v_proj.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.out_proj.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn.out_proj.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn_layer_norm.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.self_attn_layer_norm.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.fc1.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.fc1.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.fc2.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.fc2.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.final_layer_norm.weight": "pytorch_model-2-of-32.bin", "model.decoder.layers.1.final_layer_norm.bias": "pytorch_model-2-of-32.bin", "model.decoder.layers.2.self_attn.q_proj.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.q_proj.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.k_proj.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.k_proj.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.v_proj.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.v_proj.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.out_proj.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn.out_proj.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn_layer_norm.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.self_attn_layer_norm.bias": "pytorch_model-3-of-32.bin", 
"model.decoder.layers.2.fc1.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.fc1.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.fc2.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.fc2.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.final_layer_norm.weight": "pytorch_model-3-of-32.bin", "model.decoder.layers.2.final_layer_norm.bias": "pytorch_model-3-of-32.bin", "model.decoder.layers.3.self_attn.q_proj.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.q_proj.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.k_proj.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.k_proj.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.v_proj.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.v_proj.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.out_proj.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn.out_proj.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn_layer_norm.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.self_attn_layer_norm.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.fc1.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.fc1.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.fc2.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.fc2.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.final_layer_norm.weight": "pytorch_model-4-of-32.bin", "model.decoder.layers.3.final_layer_norm.bias": "pytorch_model-4-of-32.bin", "model.decoder.layers.4.self_attn.q_proj.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.q_proj.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.k_proj.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.k_proj.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.v_proj.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.v_proj.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.out_proj.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn.out_proj.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn_layer_norm.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.self_attn_layer_norm.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.fc1.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.fc1.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.fc2.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.fc2.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.final_layer_norm.weight": "pytorch_model-5-of-32.bin", "model.decoder.layers.4.final_layer_norm.bias": "pytorch_model-5-of-32.bin", "model.decoder.layers.5.self_attn.q_proj.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.q_proj.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.k_proj.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.k_proj.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.v_proj.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.v_proj.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.out_proj.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn.out_proj.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.self_attn_layer_norm.weight": "pytorch_model-6-of-32.bin", 
"model.decoder.layers.5.self_attn_layer_norm.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.fc1.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.fc1.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.fc2.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.fc2.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.final_layer_norm.weight": "pytorch_model-6-of-32.bin", "model.decoder.layers.5.final_layer_norm.bias": "pytorch_model-6-of-32.bin", "model.decoder.layers.6.self_attn.q_proj.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.q_proj.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.k_proj.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.k_proj.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.v_proj.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.v_proj.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.out_proj.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn.out_proj.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn_layer_norm.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.self_attn_layer_norm.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.fc1.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.fc1.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.fc2.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.fc2.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.final_layer_norm.weight": "pytorch_model-7-of-32.bin", "model.decoder.layers.6.final_layer_norm.bias": "pytorch_model-7-of-32.bin", "model.decoder.layers.7.self_attn.q_proj.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.q_proj.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.k_proj.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.k_proj.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.v_proj.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.v_proj.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.out_proj.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn.out_proj.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn_layer_norm.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.self_attn_layer_norm.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.fc1.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.fc1.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.fc2.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.fc2.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.final_layer_norm.weight": "pytorch_model-8-of-32.bin", "model.decoder.layers.7.final_layer_norm.bias": "pytorch_model-8-of-32.bin", "model.decoder.layers.8.self_attn.q_proj.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.q_proj.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.k_proj.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.k_proj.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.v_proj.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.v_proj.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.out_proj.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn.out_proj.bias": "pytorch_model-9-of-32.bin", 
"model.decoder.layers.8.self_attn_layer_norm.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.self_attn_layer_norm.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.fc1.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.fc1.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.fc2.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.fc2.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.final_layer_norm.weight": "pytorch_model-9-of-32.bin", "model.decoder.layers.8.final_layer_norm.bias": "pytorch_model-9-of-32.bin", "model.decoder.layers.9.self_attn.q_proj.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.q_proj.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.k_proj.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.k_proj.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.v_proj.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.v_proj.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.out_proj.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn.out_proj.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn_layer_norm.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.self_attn_layer_norm.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.fc1.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.fc1.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.fc2.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.fc2.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.final_layer_norm.weight": "pytorch_model-10-of-32.bin", "model.decoder.layers.9.final_layer_norm.bias": "pytorch_model-10-of-32.bin", "model.decoder.layers.10.self_attn.q_proj.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.q_proj.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.k_proj.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.k_proj.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.v_proj.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.v_proj.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.out_proj.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn.out_proj.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn_layer_norm.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.self_attn_layer_norm.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.fc1.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.fc1.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.fc2.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.fc2.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.final_layer_norm.weight": "pytorch_model-11-of-32.bin", "model.decoder.layers.10.final_layer_norm.bias": "pytorch_model-11-of-32.bin", "model.decoder.layers.11.self_attn.q_proj.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.q_proj.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.k_proj.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.k_proj.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.v_proj.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.v_proj.bias": "pytorch_model-12-of-32.bin", 
"model.decoder.layers.11.self_attn.out_proj.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn.out_proj.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn_layer_norm.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.self_attn_layer_norm.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.fc1.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.fc1.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.fc2.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.fc2.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.final_layer_norm.weight": "pytorch_model-12-of-32.bin", "model.decoder.layers.11.final_layer_norm.bias": "pytorch_model-12-of-32.bin", "model.decoder.layers.12.self_attn.q_proj.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.q_proj.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.k_proj.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.k_proj.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.v_proj.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.v_proj.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.out_proj.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn.out_proj.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn_layer_norm.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.self_attn_layer_norm.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.fc1.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.fc1.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.fc2.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.fc2.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.final_layer_norm.weight": "pytorch_model-13-of-32.bin", "model.decoder.layers.12.final_layer_norm.bias": "pytorch_model-13-of-32.bin", "model.decoder.layers.13.self_attn.q_proj.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.q_proj.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.k_proj.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.k_proj.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.v_proj.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.v_proj.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.out_proj.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn.out_proj.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn_layer_norm.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.self_attn_layer_norm.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.fc1.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.fc1.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.fc2.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.fc2.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.final_layer_norm.weight": "pytorch_model-14-of-32.bin", "model.decoder.layers.13.final_layer_norm.bias": "pytorch_model-14-of-32.bin", "model.decoder.layers.14.self_attn.q_proj.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.q_proj.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.k_proj.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.k_proj.bias": 
"pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.v_proj.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.v_proj.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.out_proj.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn.out_proj.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn_layer_norm.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.self_attn_layer_norm.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.fc1.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.fc1.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.fc2.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.fc2.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.final_layer_norm.weight": "pytorch_model-15-of-32.bin", "model.decoder.layers.14.final_layer_norm.bias": "pytorch_model-15-of-32.bin", "model.decoder.layers.15.self_attn.q_proj.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.q_proj.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.k_proj.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.k_proj.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.v_proj.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.v_proj.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.out_proj.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn.out_proj.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn_layer_norm.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.self_attn_layer_norm.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.fc1.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.fc1.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.fc2.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.fc2.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.final_layer_norm.weight": "pytorch_model-16-of-32.bin", "model.decoder.layers.15.final_layer_norm.bias": "pytorch_model-16-of-32.bin", "model.decoder.layers.16.self_attn.q_proj.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.q_proj.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.k_proj.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.k_proj.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.v_proj.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.v_proj.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.out_proj.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn.out_proj.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn_layer_norm.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.self_attn_layer_norm.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.fc1.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.fc1.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.fc2.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.fc2.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.final_layer_norm.weight": "pytorch_model-17-of-32.bin", "model.decoder.layers.16.final_layer_norm.bias": "pytorch_model-17-of-32.bin", "model.decoder.layers.17.self_attn.q_proj.weight": "pytorch_model-18-of-32.bin", 
"model.decoder.layers.17.self_attn.q_proj.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.k_proj.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.k_proj.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.v_proj.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.v_proj.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.out_proj.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn.out_proj.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn_layer_norm.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.self_attn_layer_norm.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.fc1.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.fc1.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.fc2.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.fc2.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.final_layer_norm.weight": "pytorch_model-18-of-32.bin", "model.decoder.layers.17.final_layer_norm.bias": "pytorch_model-18-of-32.bin", "model.decoder.layers.18.self_attn.q_proj.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.q_proj.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.k_proj.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.k_proj.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.v_proj.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.v_proj.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.out_proj.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn.out_proj.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn_layer_norm.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.self_attn_layer_norm.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.fc1.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.fc1.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.fc2.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.fc2.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.final_layer_norm.weight": "pytorch_model-19-of-32.bin", "model.decoder.layers.18.final_layer_norm.bias": "pytorch_model-19-of-32.bin", "model.decoder.layers.19.self_attn.q_proj.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.q_proj.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.k_proj.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.k_proj.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.v_proj.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.v_proj.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.out_proj.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn.out_proj.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn_layer_norm.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.self_attn_layer_norm.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.fc1.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.fc1.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.fc2.weight": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.fc2.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.19.final_layer_norm.weight": 
"pytorch_model-20-of-32.bin", "model.decoder.layers.19.final_layer_norm.bias": "pytorch_model-20-of-32.bin", "model.decoder.layers.20.self_attn.q_proj.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.q_proj.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.k_proj.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.k_proj.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.v_proj.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.v_proj.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.out_proj.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn.out_proj.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn_layer_norm.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.self_attn_layer_norm.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.fc1.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.fc1.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.fc2.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.fc2.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.final_layer_norm.weight": "pytorch_model-21-of-32.bin", "model.decoder.layers.20.final_layer_norm.bias": "pytorch_model-21-of-32.bin", "model.decoder.layers.21.self_attn.q_proj.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.q_proj.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.k_proj.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.k_proj.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.v_proj.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.v_proj.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.out_proj.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn.out_proj.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn_layer_norm.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.self_attn_layer_norm.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.fc1.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.fc1.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.fc2.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.fc2.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.final_layer_norm.weight": "pytorch_model-22-of-32.bin", "model.decoder.layers.21.final_layer_norm.bias": "pytorch_model-22-of-32.bin", "model.decoder.layers.22.self_attn.q_proj.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.q_proj.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.k_proj.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.k_proj.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.v_proj.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.v_proj.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.out_proj.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn.out_proj.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn_layer_norm.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.self_attn_layer_norm.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.fc1.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.fc1.bias": "pytorch_model-23-of-32.bin", 
"model.decoder.layers.22.fc2.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.fc2.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.final_layer_norm.weight": "pytorch_model-23-of-32.bin", "model.decoder.layers.22.final_layer_norm.bias": "pytorch_model-23-of-32.bin", "model.decoder.layers.23.self_attn.q_proj.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.q_proj.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.k_proj.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.k_proj.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.v_proj.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.v_proj.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.out_proj.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn.out_proj.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn_layer_norm.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.self_attn_layer_norm.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.fc1.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.fc1.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.fc2.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.fc2.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.final_layer_norm.weight": "pytorch_model-24-of-32.bin", "model.decoder.layers.23.final_layer_norm.bias": "pytorch_model-24-of-32.bin", "model.decoder.layers.24.self_attn.q_proj.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.q_proj.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.k_proj.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.k_proj.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.v_proj.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.v_proj.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.out_proj.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn.out_proj.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn_layer_norm.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.self_attn_layer_norm.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.fc1.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.fc1.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.fc2.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.fc2.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.final_layer_norm.weight": "pytorch_model-25-of-32.bin", "model.decoder.layers.24.final_layer_norm.bias": "pytorch_model-25-of-32.bin", "model.decoder.layers.25.self_attn.q_proj.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.q_proj.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.k_proj.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.k_proj.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.v_proj.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.v_proj.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.out_proj.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn.out_proj.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.self_attn_layer_norm.weight": "pytorch_model-26-of-32.bin", 
"model.decoder.layers.25.self_attn_layer_norm.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.fc1.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.fc1.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.fc2.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.fc2.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.final_layer_norm.weight": "pytorch_model-26-of-32.bin", "model.decoder.layers.25.final_layer_norm.bias": "pytorch_model-26-of-32.bin", "model.decoder.layers.26.self_attn.q_proj.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.q_proj.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.k_proj.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.k_proj.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.v_proj.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.v_proj.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.out_proj.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn.out_proj.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn_layer_norm.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.self_attn_layer_norm.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.fc1.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.fc1.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.fc2.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.fc2.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.final_layer_norm.weight": "pytorch_model-27-of-32.bin", "model.decoder.layers.26.final_layer_norm.bias": "pytorch_model-27-of-32.bin", "model.decoder.layers.27.self_attn.q_proj.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.q_proj.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.k_proj.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.k_proj.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.v_proj.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.v_proj.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.out_proj.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn.out_proj.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn_layer_norm.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.self_attn_layer_norm.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.fc1.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.fc1.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.fc2.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.fc2.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.final_layer_norm.weight": "pytorch_model-28-of-32.bin", "model.decoder.layers.27.final_layer_norm.bias": "pytorch_model-28-of-32.bin", "model.decoder.layers.28.self_attn.q_proj.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.q_proj.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.k_proj.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.k_proj.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.v_proj.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.v_proj.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn.out_proj.weight": "pytorch_model-29-of-32.bin", 
"model.decoder.layers.28.self_attn.out_proj.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn_layer_norm.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.self_attn_layer_norm.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.fc1.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.fc1.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.fc2.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.fc2.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.final_layer_norm.weight": "pytorch_model-29-of-32.bin", "model.decoder.layers.28.final_layer_norm.bias": "pytorch_model-29-of-32.bin", "model.decoder.layers.29.self_attn.q_proj.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.q_proj.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.k_proj.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.k_proj.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.v_proj.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.v_proj.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.out_proj.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn.out_proj.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn_layer_norm.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.self_attn_layer_norm.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.fc1.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.fc1.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.fc2.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.fc2.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.final_layer_norm.weight": "pytorch_model-30-of-32.bin", "model.decoder.layers.29.final_layer_norm.bias": "pytorch_model-30-of-32.bin", "model.decoder.layers.30.self_attn.q_proj.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.q_proj.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.k_proj.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.k_proj.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.v_proj.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.v_proj.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.out_proj.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn.out_proj.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn_layer_norm.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.self_attn_layer_norm.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.fc1.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.fc1.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.fc2.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.fc2.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.final_layer_norm.weight": "pytorch_model-31-of-32.bin", "model.decoder.layers.30.final_layer_norm.bias": "pytorch_model-31-of-32.bin", "model.decoder.layers.31.self_attn.q_proj.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.q_proj.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.k_proj.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.k_proj.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.v_proj.weight": 
"pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.v_proj.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.out_proj.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn.out_proj.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn_layer_norm.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.self_attn_layer_norm.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.fc1.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.fc1.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.fc2.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.fc2.bias": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.final_layer_norm.weight": "pytorch_model-32-of-32.bin", "model.decoder.layers.31.final_layer_norm.bias": "pytorch_model-32-of-32.bin", "lm_head.weight": "pytorch_model-32-of-32.bin"}}
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "[PAD]",
+ "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "clean_up_tokenization_spaces": true,
+ "model_max_length": 512,
+ "padding_side": "right",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+ }
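tokenizer_config.json caps inputs at 512 tokens and pads on the right, using the fast tokenizer backed by tokenizer.json above. A minimal loading sketch (the local path is a placeholder and the batch strings are illustrative only):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./opi_full_tuned_model")  # placeholder path to a local clone
print(tok.model_max_length, tok.padding_side)                  # 512, "right"

batch = tok(["first example input", "a second, longer example input"],
            padding=True, truncation=True, return_tensors="pt")
print(batch["input_ids"].shape)                                # (2, padded_length), capped at 512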
trainer_state.json ADDED
@@ -0,0 +1,127 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.999934373154245,
+ "global_step": 8571,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.18,
+ "learning_rate": 1.9420185252014917e-05,
+ "loss": 0.5431,
+ "step": 500
+ },
+ {
+ "epoch": 0.35,
+ "learning_rate": 1.8217250090220138e-05,
+ "loss": 0.3568,
+ "step": 1000
+ },
+ {
+ "epoch": 0.53,
+ "learning_rate": 1.701431492842536e-05,
+ "loss": 0.2985,
+ "step": 1500
+ },
+ {
+ "epoch": 0.7,
+ "learning_rate": 1.581137976663058e-05,
+ "loss": 0.2649,
+ "step": 2000
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 1.4608444604835801e-05,
+ "loss": 0.241,
+ "step": 2500
+ },
+ {
+ "epoch": 1.05,
+ "learning_rate": 1.340550944304102e-05,
+ "loss": 0.209,
+ "step": 3000
+ },
+ {
+ "epoch": 1.23,
+ "learning_rate": 1.2202574281246243e-05,
+ "loss": 0.1683,
+ "step": 3500
+ },
+ {
+ "epoch": 1.4,
+ "learning_rate": 1.0999639119451462e-05,
+ "loss": 0.1611,
+ "step": 4000
+ },
+ {
+ "epoch": 1.58,
+ "learning_rate": 9.796703957656684e-06,
+ "loss": 0.1536,
+ "step": 4500
+ },
+ {
+ "epoch": 1.75,
+ "learning_rate": 8.593768795861904e-06,
+ "loss": 0.1457,
+ "step": 5000
+ },
+ {
+ "epoch": 1.93,
+ "learning_rate": 7.390833634067124e-06,
+ "loss": 0.141,
+ "step": 5500
+ },
+ {
+ "epoch": 2.1,
+ "learning_rate": 6.187898472272345e-06,
+ "loss": 0.1142,
+ "step": 6000
+ },
+ {
+ "epoch": 2.28,
+ "learning_rate": 4.9849633104775655e-06,
+ "loss": 0.0962,
+ "step": 6500
+ },
+ {
+ "epoch": 2.45,
+ "learning_rate": 3.7820281486827865e-06,
+ "loss": 0.0946,
+ "step": 7000
+ },
+ {
+ "epoch": 2.63,
+ "learning_rate": 2.579092986888007e-06,
+ "loss": 0.091,
+ "step": 7500
+ },
+ {
+ "epoch": 2.8,
+ "learning_rate": 1.3761578250932275e-06,
+ "loss": 0.0891,
+ "step": 8000
+ },
+ {
+ "epoch": 2.98,
+ "learning_rate": 1.7322266329844823e-07,
+ "loss": 0.0877,
+ "step": 8500
+ },
+ {
+ "epoch": 3.0,
+ "step": 8571,
+ "total_flos": 1.8800594957369344e+16,
+ "train_loss": 0.19064752925667625,
+ "train_runtime": 224598.7373,
+ "train_samples_per_second": 19.539,
+ "train_steps_per_second": 0.038
+ }
+ ],
+ "max_steps": 8571,
+ "num_train_epochs": 3,
+ "total_flos": 1.8800594957369344e+16,
+ "trial_name": null,
+ "trial_params": null
+ }
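trainer_state.json logs the loss and the linearly decaying learning rate every 500 steps across 3 epochs (8571 steps total), with the loss falling from 0.54 to roughly 0.09 and a final average train_loss of about 0.19. A short sketch for pulling that curve back out of the file:

import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "loss" in entry:  # the per-500-step logging entries
        print(f'step {entry["step"]:5d}  epoch {entry["epoch"]:.2f}  loss {entry["loss"]:.4f}')

final = state["log_history"][-1]  # summary entry appended at the end of training
print("train_loss:", final["train_loss"], "runtime (s):", final["train_runtime"])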
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bdd57b3fdca966c3798b3c5d2a963dd81d66fd9d263ec872b80bf98f4056cf0
+ size 5039
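training_args.bin is another LFS pointer; the underlying ~5 KB file is the pickled transformers TrainingArguments object saved by the Trainer. A hedged way to inspect it, assuming a transformers version compatible with the 4.28.x it was written by:

import torch

args = torch.load("training_args.bin", weights_only=False)  # a pickled TrainingArguments object, not tensors
print(type(args).__name__)                                   # expected: "TrainingArguments"
print(args.num_train_epochs, args.learning_rate, args.per_device_train_batch_size)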