Files changed (1)
  1. scripts/deepseek_slice.py +3 -173
scripts/deepseek_slice.py CHANGED
@@ -1,173 +1,3 @@
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from safetensors.torch import safe_open, save_file
- import torch
- import os
- from pathlib import Path
- import json
- import re
-
- model_dir_name = "DeepSeek-V3-bf16"
- model_dir_path = Path(model_dir_name)
-
- output_dir_name = "DeepSeek-V3-slice"
- output_dir_path = Path(output_dir_name)
- os.makedirs(output_dir_name, exist_ok=True)
-
- try:
-     tensor_map_json = json.load(open(model_dir_path / "model.safetensors.index.json"))
-     weight_map = tensor_map_json["weight_map"]
- except FileNotFoundError:
-     print("Model index file not found")
-     raise
-
- tensor_files = list(set(weight_map.values()))
- tensor_files.sort()
- print(f"Number of files to convert: {len(tensor_files)}")
-
- try:
-     config_json = json.load(open(model_dir_path / "config.json"))
- except FileNotFoundError:
-     print("Model config file not found")
-     raise
-
- # experts
- n_routed_experts = int(config_json["n_routed_experts"])
-
- # layers
- num_hidden_layers = int(config_json["num_hidden_layers"])
-
- # active experts
- num_experts_per_tok = int(config_json["num_experts_per_tok"])
-
- # dense layers are replaced with MoE starting from this layer index
- first_k_dense_replace = int(config_json["first_k_dense_replace"])
-
- converted_tensors_size = 0
-
- target_n_routed_experts = 64
-
-
- def print_tensor_info(tensor, key, new_key=None):
-     print(f"key: {key} to {new_key if new_key else key}, shape: {tensor.shape}, size: {tensor.numel() * tensor.element_size()} Byte")
-
- def ensure_tensor_has_data(tensor):
-     try:
-         # test whether the tensor is actually accessible
-         tensor[0]
-         return tensor
-     except Exception as e:
-         print(f"Tensor rebuild needed: {e}")
-         # explicitly rebuild the tensor
-         return torch.tensor(tensor.cpu().numpy(), dtype=tensor.dtype)
-
- with open("layer_topk_idx_distribution.json", "r") as f:
-     layer_topk_idx_distribution = json.load(f)
-
-
- for i, tensor_file_name in enumerate(tensor_files, 1):
-     print(f"\nProcessing: {tensor_file_name} ({i}/{len(tensor_files)})")
-
-     tensor_path = model_dir_path / tensor_file_name
-     tensor_data = safe_open(tensor_path, framework="pt")
-     converted_tensors = {}
-
-     for key in tensor_data.keys():
-         tensor = tensor_data.get_tensor(key)
-         tensor = ensure_tensor_has_data(tensor)  # make sure the tensor holds real data
-
-         # get the layer index, e.g. the number in model.layers.0.; some keys have none
-         layer_idx = int(re.search(r'model\.layers\.(\d+)\.', key).group(1)) if re.search(r'model\.layers\.(\d+)\.', key) else -1
-
-         # dense layers (and tensors without a layer index) are saved as-is
-         if layer_idx < first_k_dense_replace:
-             converted_tensors[key] = tensor.clone()
-             converted_tensors_size += tensor.numel() * tensor.element_size()
-             print_tensor_info(tensor, key, key)
-             continue
-
-         if layer_idx >= num_hidden_layers:
-             del tensor_map_json["weight_map"][key]
-             continue
-
-         # get the expert indices to use for this layer from layer_topk_idx_distribution
-         if str(layer_idx) in layer_topk_idx_distribution:
-             experts_list = layer_topk_idx_distribution[str(layer_idx)]["experts"][:target_n_routed_experts]
-         else:
-             step = n_routed_experts // target_n_routed_experts
-             experts_list = list(range(0, n_routed_experts, step))[:target_n_routed_experts]
-         experts_list.sort()
-         experts_tensor = torch.tensor(experts_list, dtype=torch.long, device=tensor.device)
-
-         # experts
-         if ".mlp.experts." in key:
-             experts_idx = int(re.search(r'\.mlp\.experts\.(\d+)\.', key).group(1))
-             if experts_idx in experts_list:
-                 new_key = key.replace(f".mlp.experts.{experts_idx}.", f".mlp.experts.{experts_list.index(experts_idx)}.")
-                 converted_tensors[new_key] = tensor.clone()
-                 converted_tensors_size += tensor.numel() * tensor.element_size()
-                 print_tensor_info(tensor, key, new_key)
-                 tensor_map_json["weight_map"][new_key] = tensor_file_name
-             else:
-                 print(f"skip experts: {key}")
-             continue
-
-         # shared-experts
-         if ".mlp.shared_experts." in key:
-             # save the shared experts as-is
-             converted_tensors[key] = tensor.clone()
-             converted_tensors_size += tensor.numel() * tensor.element_size()
-             print_tensor_info(tensor, key, key)
-             continue
-
-         if ".mlp.gate.e_score_correction_bias" in key:
-             # convert the [256] tensor to [target_n_routed_experts]
-             squeezed_tensor = tensor[experts_tensor].clone()
-             converted_tensors[key] = squeezed_tensor
-             converted_tensors_size += squeezed_tensor.numel() * squeezed_tensor.element_size()
-             print_tensor_info(squeezed_tensor, key, key)
-             continue
-
-         if ".mlp.gate.weight" in key:
-             # convert the [256, 7168] tensor to [target_n_routed_experts, 7168]
-             squeezed_tensor = tensor[experts_tensor, :].clone()
-             converted_tensors[key] = squeezed_tensor
-             converted_tensors_size += squeezed_tensor.numel() * squeezed_tensor.element_size()
-             print_tensor_info(squeezed_tensor, key, key)
-             continue
-
-         converted_tensors[key] = tensor.clone()
-         converted_tensors_size += tensor.numel() * tensor.element_size()
-         print_tensor_info(tensor, key, key)
-
-     save_file(converted_tensors, output_dir_path / tensor_file_name, metadata={"format": "pt"})
-
- print("\nConversion complete!")
- print(f"Total size: {converted_tensors_size / (1024**3):.2f} GB")
-
- # model.safetensors.index.json
-
- old_keys = list(tensor_map_json["weight_map"].keys())
- for key in old_keys:
-     if ".mlp.experts." in key:
-         experts_idx = int(re.search(r'\.mlp\.experts\.(\d+)\.', key).group(1))
-         if experts_idx >= target_n_routed_experts:
-             del tensor_map_json["weight_map"][key]
-
-
- tensor_map_json["metadata"]["total_size"] = converted_tensors_size
- with open(output_dir_path / "model.safetensors.index.json", "w") as f:
-     json.dump(tensor_map_json, f, indent=4)
-
- # config.json
- output_config_json = config_json.copy()
- output_config_json["n_routed_experts"] = target_n_routed_experts
- # output_config_json["num_hidden_layers"] = num_hidden_layers
- output_config_json["num_experts_per_tok"] = 4
- # output_config_json["first_k_dense_replace"] = first_k_dense_replace
- # output_config_json["n_shared_experts"] = n_shared_experts
- # output_config_json["topk_group"] = topk_group
- # output_config_json["n_group"] = n_group
-
-
- with open(output_dir_path / "config.json", "w") as f:
-     json.dump(output_config_json, f, indent=4)
 
+ \[
+ R = P \land Q \land C
+ \]