import json
import os
import re

import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# The text-only target has 32 decoder layers; the vision source interleaves 8 extra
# cross-attention layers among them, for 40 layers in its language tower.
total_layers = 32

# Indices of the cross-attention layers inside the 40-layer vision language tower.
cross_attention_layers = [3, 8, 13, 18, 23, 28, 33, 38]

target_model = "meta-llama/Llama-3.1-8B-Instruct"
print(f"Target model: {target_model}")

source_model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
print(f"Source model: {source_model}")

def create_inverse_layer_mapping(total_layers=total_layers, cross_attn_layers=cross_attention_layers):
    """
    Create a mapping from 90B/11B layer indices to 70B/8B layer indices.

    Cross-attention layers have no counterpart in the text-only model, so they are
    skipped and the remaining self-attention layers are renumbered consecutively.
    """
    mapping = {}
    removed_layers = []

    for i in range(total_layers + len(cross_attn_layers)):
        if i not in cross_attn_layers and len(mapping) < total_layers:
            mapping[i] = len(mapping)
        else:
            removed_layers.append(i)
    return mapping, removed_layers
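
# Quick sanity check (a small sketch, assuming the cross-attention layout listed above):
# the 8 cross-attention layers are dropped and the surviving 32 layers are renumbered 0..31.
_mapping, _removed = create_inverse_layer_mapping()
assert len(_mapping) == total_layers and _removed == cross_attention_layers
assert _mapping[0] == 0 and _mapping[4] == 3 and _mapping[39] == 31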

def load_sharded_state_dict(model_id):
    """
    Load a sharded safetensors state dict from a local directory or a Hugging Face model ID.

    Args:
        model_id: Either a local path or a Hugging Face model ID (e.g., "meta-llama/Llama-2-7b").

    Returns:
        dict: The loaded state dictionary.
    """
    if os.path.isdir(model_id):
        model_dir = model_id
    else:
        print(f"Downloading model from Hugging Face: {model_id}")
        model_dir = snapshot_download(
            model_id,
            allow_patterns=["*.safetensors*", "*.json"],
            ignore_patterns=["*.bin", "*.md", "*.py"],
        )

    index_file = os.path.join(model_dir, 'model.safetensors.index.json')
    if not os.path.exists(index_file):
        raise FileNotFoundError(f"Could not find index file: {index_file}")

    with open(index_file, 'r') as f:
        index_data = json.load(f)

    weight_map = index_data['weight_map']
    state_dict = {}
    shard_to_params = {}

    # Group parameter names by the shard file that holds them so each shard is opened once.
    for param_name, shard_file in weight_map.items():
        if shard_file not in shard_to_params:
            shard_to_params[shard_file] = []
        shard_to_params[shard_file].append(param_name)

    # Read every tensor from its shard on CPU.
    for shard_file, params_in_shard in shard_to_params.items():
        shard_path = os.path.join(model_dir, shard_file)
        with safe_open(shard_path, framework="pt", device="cpu") as f:
            for name in params_in_shard:
                state_dict[name] = f.get_tensor(name)

    return state_dict
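
# Note: this materializes every tensor of the checkpoint in host memory at once (roughly
# 20 GB for the 11B model in bf16), which keeps the script simple; a lazier variant could
# process one shard at a time if memory is tight.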

def compare_model_states(model, new_state_dict):
    """Compare a model's current parameters against a candidate state dict."""
    current_state = model.state_dict()
    unchanged_params = []
    changed_params = []
    missing_params = []

    for name, param in current_state.items():
        if name not in new_state_dict:
            missing_params.append(name)
        elif torch.equal(param, new_state_dict[name]):
            unchanged_params.append(name)
        else:
            sum_abs_diff = torch.sum(torch.abs(param - new_state_dict[name]))
            changed_params.append({'name': name, 'sum_abs_diff': sum_abs_diff.item()})

    return {
        'unchanged': unchanged_params,
        'changed': changed_params,
        'missing': missing_params
    }

# Build the layer-index mapping and pull the full state dict of the vision checkpoint.
layer_mapping, removed_layers = create_inverse_layer_mapping()
source_state_dict = load_sharded_state_dict(source_model)

# Remap the surviving weights onto the parameter names the text-only model expects.
target_state_dict = {}

for name, param in source_state_dict.items():
    # Keep only the language-model weights; skip the vision tower and projector entirely.
    if not (name.startswith('language_model.model.layers.') or
            name == 'language_model.model.embed_tokens.weight' or
            name == 'language_model.lm_head.weight' or
            name == 'language_model.model.norm.weight'):
        continue

    if name.startswith('language_model.model.layers.'):
        # Renumber decoder layers, dropping the cross-attention layers that have no
        # counterpart in the text-only architecture.
        layer_match = re.match(r'language_model\.model\.layers\.(\d+)\.(.+)', name)
        if layer_match:
            source_layer = int(layer_match.group(1))
            if source_layer in layer_mapping:
                target_layer = layer_mapping[source_layer]
                new_name = f'model.layers.{target_layer}.{layer_match.group(2)}'
                target_state_dict[new_name] = param
    elif name == 'language_model.lm_head.weight':
        target_state_dict['lm_head.weight'] = param
    elif name == 'language_model.model.embed_tokens.weight':
        # The vision checkpoint extends the vocabulary with extra image special tokens;
        # truncate the embedding matrix back to the original 128256-token vocabulary.
        original_embed_size = 128256
        target_state_dict['model.embed_tokens.weight'] = param[:original_embed_size, :]
    elif name == 'language_model.model.norm.weight':
        target_state_dict['model.norm.weight'] = param
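
# Optional consistency check (a small sketch): every one of the 32 target layers should
# now have remapped weights under its new index.
remapped_layers = set()
for k in target_state_dict:
    m = re.match(r'model\.layers\.(\d+)\.', k)
    if m:
        remapped_layers.add(int(m.group(1)))
assert remapped_layers == set(range(total_layers))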

# Dump the remapped parameter names for inspection.
with open('target_state_dict.txt', 'w') as f:
    f.write('\n'.join(target_state_dict.keys()))

# Instantiate an 8B-shaped model directly from the remapped tensors; nothing is
# downloaded from the target repo except its config.
config = AutoConfig.from_pretrained(target_model)

model = AutoModelForCausalLM.from_pretrained(
    None,
    config=config,
    state_dict=target_state_dict,
    torch_dtype=torch.bfloat16,
)
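
# Alternative load path (a sketch, not exercised here): build the model from the config
# alone and load the remapped tensors explicitly so missing/unexpected keys get reported.
# model = AutoModelForCausalLM.from_config(config)
# model = model.to(torch.bfloat16)
# missing, unexpected = model.load_state_dict(target_state_dict, strict=False)
# print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")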

# Optional: compare the extracted weights against the original 8B checkpoint (kept disabled).
'''
origmodel = AutoModelForCausalLM.from_pretrained(
    target_model,
    torch_dtype=torch.bfloat16,
)

result = compare_model_states(model, origmodel.state_dict())
print("Unchanged parameters:", len(result['unchanged']))
print("Changed parameters:", len(result['changed']))
print("Missing parameters:", len(result['missing']))

# Write the comparison to a file.
with open('result.txt', 'w') as f:
    f.write(json.dumps(result, indent=2))
'''

tokenizer = AutoTokenizer.from_pretrained(target_model)

model.save_pretrained("Llama-3.2-8B-extracted")
tokenizer.save_pretrained("Llama-3.2-8B-extracted")
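
# Optional smoke test (a sketch; prompt and settings are illustrative): reload the
# extracted checkpoint and generate a few tokens to confirm it behaves as a plain text model.
# check_model = AutoModelForCausalLM.from_pretrained("Llama-3.2-8B-extracted", torch_dtype=torch.bfloat16)
# check_tok = AutoTokenizer.from_pretrained("Llama-3.2-8B-extracted")
# inputs = check_tok("The capital of France is", return_tensors="pt")
# print(check_tok.decode(check_model.generate(**inputs, max_new_tokens=8)[0]))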