Commit 7efae60 (parent 0edb56d), committed by ArthurZ (HF staff)

Update convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py
convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,9 +23,6 @@ from transformers.modeling_utils import dtype_byte_size
 from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME


-# 'encoder.layers.7.moe_layer.experts.0.fc2.bias', 'encoder.layers.11.moe_layer.experts.0.fc1.weight',
-
-
 def remove_ignore_keys_(state_dict):
     ignore_keys = [
         "encoder.version",
@@ -48,30 +45,30 @@ def make_linear_from_emb(emb):
     return lin_layer


-def rename_fairseq_keys(state_dict, expert_idx = None):
-    # 'encoder.layers.7.moe_layer.experts.0.fc2.bias' ->'encoder.layers.7.ffn.mlp.experts.0.fc2.bias'
-    # 'encoder.layers.7.fc2.bias' -> 'encoder.layers.7.ffn.mlp.fc2.bias'
-    # encoder.layers.7.wg -> encoder.layers.7.ffn.mlp.router.classifier
+def rename_fairseq_keys(state_dict, expert_idx=None):
     new_dict = {}
     for old_key in state_dict.keys():
         key = old_key
         if "experts" in key:
-            key = key.replace("moe_layer.experts.0", f"ffn.mlp.experts.{expert_idx}")
-        elif "fc2" :
-            key = key.replace(".fc2.", ".ffn.mlp.fc2")
-        elif "fc1" :
-            key = key.replace(".fc1.", ".ffn.mlp.fc1")
+            key = key.replace("moe_layer.experts.0", f"ffn.mlp.experts.expert_{expert_idx}")
         elif "gate" in key:
             key = key.replace(".moe_layer.gate.wg", ".ffn.mlp.router.classifier")
+        if "fc2" and "experts" not in key:
+            key = key.replace(".fc2.", ".ffn.mlp.fc2.")
+        if "fc1" and "experts" not in key:
+            key = key.replace(".fc1.", ".ffn.mlp.fc1.")
+        if ".encoder_attn." in key:
+            key = key.replace(".encoder_attn.", ".cross_attention.")
+        if "encoder_attn_layer_norm" in key:
+            key = key.replace("encoder_attn_layer_norm", "cross_attention_layer_norm")
+        if "final_layer_norm" in key:
+            key = key.replace("final_layer_norm", "ffn.layer_norm")
         new_dict[key] = state_dict[old_key]
     return new_dict


-def shard_on_the_fly(
-    switch_checkpoint_path, dump_path, num_experts, dtype, weights_name: str = WEIGHTS_NAME
-):
+def shard_on_the_fly(switch_checkpoint_path, dump_path, num_experts, dtype, weights_name: str = WEIGHTS_NAME):
     sharded_state_dicts = []
-    current_block = {}
     total_size = 0
     os.makedirs(dump_path, exist_ok=True)

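Aside: a minimal sketch of what the updated rename_fairseq_keys produces for a few representative fairseq keys. The replacement rules mirror the ones visible in the hunk above; the sample keys, the expert_idx value, and the helper name rename_sketch are illustrative, not part of the script.

# Mirrors the replacement rules in the hunk above; sample keys are illustrative.
def rename_sketch(key, expert_idx=None):
    if "experts" in key:
        key = key.replace("moe_layer.experts.0", f"ffn.mlp.experts.expert_{expert_idx}")
    elif "gate" in key:
        key = key.replace(".moe_layer.gate.wg", ".ffn.mlp.router.classifier")
    if "experts" not in key:  # the `if "fc2" and ...` checks above reduce to this test
        key = key.replace(".fc2.", ".ffn.mlp.fc2.").replace(".fc1.", ".ffn.mlp.fc1.")
    key = key.replace(".encoder_attn.", ".cross_attention.")
    key = key.replace("encoder_attn_layer_norm", "cross_attention_layer_norm")
    return key.replace("final_layer_norm", "ffn.layer_norm")

samples = [
    ("encoder.layers.7.moe_layer.experts.0.fc2.bias", 3),    # expert weights, converted with expert_idx=3
    ("encoder.layers.3.fc1.weight", None),                   # dense FFN layer
    ("decoder.layers.5.moe_layer.gate.wg.weight", None),     # router
    ("decoder.layers.5.encoder_attn.out_proj.weight", None), # cross-attention
    ("decoder.layers.5.final_layer_norm.weight", None),
]
for old, idx in samples:
    print(old, "->", rename_sketch(old, idx))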
 
@@ -105,7 +102,6 @@ def shard_on_the_fly(
 
     # Otherwise, let's build the index
     weight_map = {}
-    shards = {}
     for idx, shard in enumerate(sharded_state_dicts):
         shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
         temp_filename = os.path.join(dump_path, weights_name.replace(".bin", f"-{idx+1:05d}-of-???.bin"))
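The weight_map built in this hunk ends up in the sharded-checkpoint index written under WEIGHTS_INDEX_NAME. A sketch of the shape that file typically takes; the file names, sizes, and parameter names below are made up.

# Illustrative index content; in transformers of this era WEIGHTS_INDEX_NAME resolves to
# "pytorch_model.bin.index.json". All values below are made up.
import json

index = {
    "metadata": {"total_size": 123456789},  # total parameter bytes across all shards
    "weight_map": {
        # parameter name -> shard file that stores it
        "encoder.layers.7.ffn.mlp.experts.expert_3.fc2.bias": "pytorch_model-00004-of-00010.bin",
        "decoder.layers.5.cross_attention.out_proj.weight": "pytorch_model-00010-of-00010.bin",
    },
}
print(json.dumps(index, indent=2))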
@@ -143,23 +139,17 @@ if __name__ == "__main__":
         help="Path to the output pytorch model.",
     )
     args = parser.parse_args()
-    # metadata, index = shard_on_the_fly(
-    #     args.nllb_moe_checkpoint_path,
-    #     args.pytorch_dump_folder_path,
-    #     128,
-    #     args.dtype,
-    # )
-
+    metadata, index = shard_on_the_fly(
+        args.nllb_moe_checkpoint_path,
+        args.pytorch_dump_folder_path,
+        128,
+        args.dtype,
+    )
 
     config = NllbMoeConfig.from_pretrained(
-        "facebook/nllb-200-3.3B",
-        num_sparse_encoder_layers=4,
-        num_sparse_decoder_layers=4,
+        "facebook/nllb-200-3.3B", encoder_sparse_step=4, decoder_sparse_step=4, num_experts=128
     )
     config.save_pretrained(args.pytorch_dump_folder_path)
-
-
-    model = NllbMoeModel(config)
+    model = NllbMoeModel.from_pretrained(args.pytorch_dump_folder_path)
+    print("Done")
     model.save_pretrained(args.pytorch_dump_folder_path)
-    # model.push_to_hub("ArthurZ/nllb-moe-54b", use_auth_token="")
-    # model.save_pretrained(args.pytorch_dump_folder_path)
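The hunks shown here do not include the size-accounting loop inside shard_on_the_fly, but the dtype_byte_size import visible near the top of the file hints at the usual pattern: sum per-tensor byte counts and start a new shard once a size budget is hit. A rough sketch of that pattern; the budget, the helper name split_into_shards, and the toy tensors are assumptions, not the script's actual code.

import torch
from transformers.modeling_utils import dtype_byte_size

MAX_SHARD_BYTES = 10 * 1024**3  # illustrative 10 GB budget per shard

def split_into_shards(state_dict, max_bytes=MAX_SHARD_BYTES):
    shards, current, current_size = [], {}, 0
    for name, tensor in state_dict.items():
        weight_size = tensor.numel() * dtype_byte_size(tensor.dtype)
        # Close the current shard once adding this tensor would blow the budget.
        if current and current_size + weight_size > max_bytes:
            shards.append(current)
            current, current_size = {}, 0
        current[name] = tensor
        current_size += weight_size
    if current:
        shards.append(current)
    return shards

toy = {"a.weight": torch.zeros(4, 4), "b.weight": torch.zeros(8, 8)}
print(len(split_into_shards(toy)))  # -> 1; the toy tensors fit in a single shard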
 
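After the conversion has run, one quick sanity check on the dump folder is to read the shard index back and confirm that no fairseq-style names survived the renaming. A minimal sketch, assuming the default weights_name so the index file is pytorch_model.bin.index.json, with an illustrative folder path.

import json
import os

dump_folder = "./nllb_moe_converted"  # illustrative; use the pytorch_dump_folder_path you passed
with open(os.path.join(dump_folder, "pytorch_model.bin.index.json")) as f:
    weight_map = json.load(f)["weight_map"]

# Every renamed key should use the new ffn.mlp / cross_attention naming.
leftovers = [k for k in weight_map if "moe_layer" in k or "encoder_attn" in k]
print(f"{len(weight_map)} parameters mapped, {len(leftovers)} keys left un-renamed")
assert not leftovers, leftovers[:5]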