thomasgauthier committed
Commit 20e31fa
Parent: aa2f9f0

Added expert extraction code

Files changed (1):
  1. README.md +106 -1

README.md CHANGED
@@ -51,4 +51,109 @@ dtype: float16
  | [Unmixtraled-22B-v0.1-expert-5](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-5) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 5 MLPs | 1099.32373046875 |
  | [Unmixtraled-22B-v0.1-expert-6](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-6) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 6 MLPs | 341.5309753417969 |
  | [Unmixtraled-22B-v0.1-expert-7](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-expert-7) | Mixtral 8x22B embed, attn, layernorm, lm_head + expert 7 MLPs | 2099.63818359375 |
- | [**Unmixtraled-22B-v0.1-lerp**](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | **Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs** | **1873.9874267578125** |
+ | [**Unmixtraled-22B-v0.1-lerp**](https://huggingface.co/thomasgauthier/Unmixtraled-22B-v0.1-lerp) | **Mixtral 8x22B embed, attn, layernorm, lm_head + linear merge of expert 0-7 MLPs** | **1873.9874267578125** |
+
+ # Code
+
+ The following code was used to extract the experts and construct the dense models:
+
+ ```python
+ # pip install -U transformers huggingface_hub "git+https://github.com/arcee-ai/mergekit@7467108c05d56ef2bb4b8f33936d437dc448f7dd"
+
+ import fnmatch
+ import json
+ import os
+ import re
+ import shutil
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from mergekit.architecture import get_architecture_info
+ from mergekit.common import ModelReference
+ from mergekit.io import LazyTensorLoader, TensorWriter
+ from tqdm import tqdm
+
+ MIXTRAL_MODEL_ID = "mistral-community/Mixtral-8x22B-v0.1"
+ MIXTRAL_PATH = snapshot_download(repo_id=MIXTRAL_MODEL_ID)
+ print(f"Mixtral downloaded to: {MIXTRAL_PATH}")
+
+ MISTRAL_PATH = snapshot_download(
+     repo_id="mistralai/Mistral-7B-v0.1", allow_patterns=["config.json"]
+ )
+ print(f"Mistral config downloaded to: {MISTRAL_PATH}")
+
+ with open(os.path.join(MISTRAL_PATH, "config.json"), "r") as f:
+     mistral_config = json.load(f)
+
+ with open(os.path.join(MIXTRAL_PATH, "config.json"), "r") as f:
+     mixtral_config = json.load(f)
+
+ # Build a dense Mistral config from the Mixtral config, keeping only the keys a
+ # Mistral-7B-style config defines (this drops the MoE-specific fields).
+ combined_config = {
+     key: mixtral_config[key] for key in mistral_config if key in mixtral_config
+ }
+ combined_config["architectures"] = ["MistralForCausalLM"]
+ combined_config["model_type"] = "mistral"
+
+ mixtral_model_ref = ModelReference.parse(MIXTRAL_PATH)
+ mixtral_architecture_info = get_architecture_info(mixtral_model_ref.config())
+ mixtral_loader = LazyTensorLoader(mixtral_model_ref.tensor_index(), lazy_unpickle=True)
+
+ ALLOW_LIST = ["generation_config.json", "tokenizer.model", "tokenizer_config.json"]
+
+ def copy_directory(src, dest, allowed_patterns):
+     os.makedirs(dest, exist_ok=True)
+     for root, dirs, files in os.walk(src):
+         # Only keep directories that match at least one of the allowed patterns
+         dirs[:] = [d for d in dirs if any(fnmatch.fnmatch(d, pattern) for pattern in allowed_patterns)]
+         for file in files:
+             # Only copy files that match at least one of the allowed patterns
+             if any(fnmatch.fnmatch(file, pattern) for pattern in allowed_patterns):
+                 src_path = os.path.join(root, file)
+                 dest_path = os.path.join(dest, os.path.relpath(src_path, src))
+                 os.makedirs(os.path.dirname(dest_path), exist_ok=True)
+                 shutil.copy2(src_path, dest_path)
+
+ # Expert MLP weights in Mixtral are named w1/w2/w3; they correspond to
+ # gate_proj/down_proj/up_proj in a dense Mistral MLP.
+ def get_tensor(layer_num, expert_num, tensor_type):
+     weight_name = f"model.layers.{layer_num}.block_sparse_moe.experts.{expert_num}.{tensor_type}.weight"
+     return mixtral_loader.get_tensor(weight_name)
+
+
+ def extract_layer_number(string):
+     match = re.search(r"layers\.(\d+)\.", string)
+     return int(match.group(1)) if match else None
+
+
+ def save_expert_as_dense(output_path, expert_num):
+     dense_model_ref = ModelReference.parse(output_path)
+     dense_architecture_info = get_architecture_info(dense_model_ref.config())
+
+     writer = TensorWriter(output_path, safe_serialization=True)
+
+     # MLP weights come from the selected expert; everything else (embeddings,
+     # attention, layernorms, lm_head) is copied straight from Mixtral.
+     for weight_info in tqdm(dense_architecture_info.all_weights(dense_model_ref.config())):
+         if weight_info.name.endswith(".up_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w3"))
+         elif weight_info.name.endswith(".down_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w2"))
+         elif weight_info.name.endswith(".gate_proj.weight"):
+             layer_num = extract_layer_number(weight_info.name)
+             writer.save_tensor(weight_info.name, get_tensor(layer_num, expert_num, "w1"))
+         else:
+             writer.save_tensor(weight_info.name, mixtral_loader.get_tensor(weight_info.name))
+
+     writer.finalize()
+
+
+ num_experts = mixtral_config["num_local_experts"]
+
+ for expert_num in range(num_experts):
+     dense_path = f"./dense_expert_{expert_num}"
+     copy_directory(MIXTRAL_PATH, dense_path, ALLOW_LIST)
+
+     with open(os.path.join(dense_path, "config.json"), "w") as f:
+         json.dump(combined_config, f, indent=2)
+
+     save_expert_as_dense(dense_path, expert_num)
+     print(f"Dense model #{expert_num} saved to {os.path.abspath(dense_path)}")
+ ```
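
Each `./dense_expert_{n}` directory produced by the script is a plain dense Mistral-architecture checkpoint, so it should load with stock `transformers`. The sketch below is not part of the original commit; it assumes the local `./dense_expert_0` output from the script above, `sentencepiece` for the tokenizer, `accelerate` for `device_map="auto"`, and enough memory for a ~22B-parameter model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path written by the extraction loop above for expert 0
DENSE_PATH = "./dense_expert_0"

tokenizer = AutoTokenizer.from_pretrained(DENSE_PATH)
model = AutoModelForCausalLM.from_pretrained(
    DENSE_PATH,
    torch_dtype="auto",  # use the dtype stored in config.json
    device_map="auto",   # requires `accelerate`; shards across available devices
)

prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```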