chargoddard committed on
Commit 2bece07
1 Parent(s): 0a07df8

Upload frankenmerge script

Files changed (1)
  1. frankenllama_22b.py +188 -0
frankenllama_22b.py ADDED
@@ -0,0 +1,188 @@
+ #!/usr/bin/env python3
+ # Charles O. Goddard
+ # 7/20/2023
+ """Script used to generate the base frankenmerge. Output will need fine-tuning to be useful."""
+
+ import copy
+ import torch
+ from torch import Tensor, nn
+ import transformers
+
+ from transformers.models.llama.modeling_llama import (
+     LlamaForCausalLM,
+     LlamaDecoderLayer,
+ )
+ from transformers import LlamaForCausalLM, LlamaConfig
+
+ import torch
+ import transformers
+ import numpy as np
+
+
+ MODEL_NAME_13B = "meta-llama/Llama-2-13b-hf"  # primary model
+ MODEL_NAME_33B = "huggyllama/llama-30b"  # donor
+ BLOCK_DIAGONAL = True
+ # If BLOCK_DIAGONAL is set to True, each tensor in the resultant model will form a
+ # block diagonal matrix, as illustrated below:
+
+ # a a a 0 0
+ # a a a 0 0
+ # a a a 0 0
+ # 0 0 0 b b
+ # 0 0 0 b b
+
+ # In this configuration, the states (hidden and intermediate) from the original
+ # and donor models are completely decoupled. That is, the hidden states
+ # corresponding to the original model remain unchanged, and the new dimensions
+ # added from the donor model do not depend on the hidden states of the original model.
+
+ # If BLOCK_DIAGONAL is set to False, the tensors will instead have the following form:
+
+ # a a a 0 0
+ # a a a 0 0
+ # a a a 0 0
+ # b b b b b
+ # b b b b b
+
+ # In this case, the output of the newly added attention heads depends on the hidden
+ # state values as if they were part of the donor model. Although the original model's
+ # hidden states remain unchanged in either case, interaction between the new and old
+ # features will result in features of varying usefulness.
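+
+ # For the default pair above: Llama-2-13B has hidden_size 5120 and
+ # intermediate_size 13824, while llama-30b has hidden_size 6656 and
+ # intermediate_size 17920, so each merged projection is widened to the 30B
+ # shape, with the 13B weights in the "a" block and the donor filling the rest.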
+
+
+ # Context manager that temporarily disables weight initialization, so the
+ # merged model can be constructed quickly and then filled in from the donors.
+ class NoInit:
+     def __enter__(self):
+         def noop(*args, **kwargs):
+             pass
+
+         (k, u, n) = (
+             torch.nn.init.kaiming_uniform_,
+             torch.nn.init.uniform_,
+             torch.nn.init.normal_,
+         )
+         torch.nn.init.kaiming_uniform_ = noop
+         torch.nn.init.uniform_ = noop
+         torch.nn.init.normal_ = noop
+
+         transformers.modeling_utils._init_weights = False
+         self.funcs = (k, u, n)
+
+     def __exit__(self, *args):
+         (k, u, n) = self.funcs
+         (
+             torch.nn.init.kaiming_uniform_,
+             torch.nn.init.uniform_,
+             torch.nn.init.normal_,
+         ) = (
+             k,
+             u,
+             n,
+         )
+         transformers.modeling_utils._init_weights = True
+
+
+ # Format a parameter count with a k/m/b suffix.
+ def format_kmb(n, digits=None):
+     n = int(n)
+     if n < 1000:
+         return str(n)
+     elif n < 1000_000:
+         return f"{round(n/1000, digits)}k"
+     elif n < 1000 * 1000 * 1000:
+         return f"{round(n/(1000*1000), digits)}m"
+     else:
+         return f"{round(n/(1000*1000*1000), digits)}b"
+
+
+ # Count the trainable parameters of a model.
+ def count_params(model):
+     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+     params = sum([np.prod(p.size()) for p in model_parameters])
+     return int(params)
+
+
+ torch.set_default_dtype(torch.float16)
+
+ # Merged config: keep the 13B model's layer count but widen the hidden and
+ # intermediate dimensions (and attention head count) to match the donor.
+ config_13b: LlamaConfig = LlamaConfig.from_pretrained(MODEL_NAME_13B)
+ config_33b: LlamaConfig = LlamaConfig.from_pretrained(MODEL_NAME_33B)
+ config_more = copy.deepcopy(config_13b)
+ config_more.intermediate_size = config_33b.intermediate_size
+ config_more.hidden_size = config_33b.hidden_size
+ config_more.num_key_value_heads = config_33b.num_key_value_heads
+ config_more.num_attention_heads = config_33b.num_key_value_heads  # == num_attention_heads for llama-30b, which has no GQA
+
+ print(config_more)
+
+ with NoInit():
+     model = LlamaForCausalLM(config_more)
+
+ print(f"{format_kmb(count_params(model), 3)} parameters")
+
+
+ # Write the merged weight into dest: s0 (primary) fills the top-left block and
+ # s1 (donor) supplies the remaining rows, either block-diagonally or across the
+ # full width; everything else is zeroed.
+ def merge_tensors_inplace(dest: Tensor, s0: Tensor, s1: Tensor, block_diagonal: bool):
+     dest.zero_()
+     if block_diagonal:
+         dest[s0.shape[0] :, s0.shape[1] :] = s1[
+             s0.shape[0] : dest.shape[0],
+             s0.shape[1] : dest.shape[1],
+         ]
+     else:
+         dest[s0.shape[0] :, :] = s1[
+             s0.shape[0] : dest.shape[0],
+             : dest.shape[1],
+         ]
+     dest[: s0.shape[0], : s0.shape[1]] = s0
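+
+ # For example, with s0 of shape (3, 3) and dest/s1 of shape (5, 5):
+ # block_diagonal=True copies s0 into dest[:3, :3] and s1[3:, 3:] into
+ # dest[3:, 3:], leaving both off-diagonal blocks zero; block_diagonal=False
+ # instead fills the new rows dest[3:, :] from s1[3:, :], so the added
+ # dimensions read the full hidden state while dest[:3, 3:] stays zero.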
+
+
+ # Load both source models in float16, skipping weight initialization.
+ with NoInit():
+     donor_13b = (
+         LlamaForCausalLM.from_pretrained(MODEL_NAME_13B).to(torch.float16).eval()
+     )
+     donor_33b = (
+         LlamaForCausalLM.from_pretrained(MODEL_NAME_33B).to(torch.float16).eval()
+     )
+
+ # Fill in the merged model's weights layer by layer from the two donors.
+ with torch.no_grad():
+     for layer_idx in range(len(model.model.layers)):
+         layer: LlamaDecoderLayer = model.model.layers[layer_idx]
+         l13: LlamaDecoderLayer = donor_13b.model.layers[layer_idx]
+         l33: LlamaDecoderLayer = donor_33b.model.layers[layer_idx]
+
+         for name in ("q_proj", "k_proj", "v_proj", "o_proj"):
+             dest: nn.Linear = getattr(layer.self_attn, name)
+             s13: nn.Linear = getattr(l13.self_attn, name)
+             s33: nn.Linear = getattr(l33.self_attn, name)
+             merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)
+
+         for name in ("up_proj", "gate_proj", "down_proj"):
+             dest: nn.Linear = getattr(layer.mlp, name)
+             s13: nn.Linear = getattr(l13.mlp, name)
+             s33: nn.Linear = getattr(l33.mlp, name)
+             merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)
+
+         # Layernorm weights are 1-D: keep the 13B values for the original
+         # dimensions and take the donor's values for the added ones.
+         layer.input_layernorm.weight[:] = l33.input_layernorm.weight[
+             : layer.input_layernorm.weight.shape[0]
+         ]
+         layer.input_layernorm.weight[
+             : l13.input_layernorm.weight.shape[0]
+         ] = l13.input_layernorm.weight
+         layer.post_attention_layernorm.weight[:] = l33.post_attention_layernorm.weight[
+             : layer.post_attention_layernorm.weight.shape[0]
+         ]
+         layer.post_attention_layernorm.weight[
+             : l13.post_attention_layernorm.weight.shape[0]
+         ] = l13.post_attention_layernorm.weight
+
+     # have initial output depend only on the original llama2-13b features, so the
+     # model starts unimpaired and can learn to incorporate the new features as well
+     model.lm_head.weight.zero_()
+     model.lm_head.weight[
+         : donor_13b.lm_head.weight.shape[0], : donor_13b.lm_head.weight.shape[1]
+     ] = donor_13b.lm_head.weight
+
+     # Token embeddings are merged the same way as the projection weights.
+     merge_tensors_inplace(
+         model.model.embed_tokens.weight,
+         donor_13b.model.embed_tokens.weight,
+         donor_33b.model.embed_tokens.weight,
+         BLOCK_DIAGONAL,
+     )
+
+ model.save_pretrained("./llama2-22b/", safe_serialization=True)
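
As the docstring notes, the merged checkpoint is only a starting point and needs fine-tuning to be useful. A minimal sketch of reloading the saved output for that purpose, assuming the default "./llama2-22b/" path from the script; the dtype argument and tokenizer choice here are illustrative assumptions, not part of the commit:

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Reload the frankenmerged checkpoint written by frankenllama_22b.py.
model = LlamaForCausalLM.from_pretrained("./llama2-22b/", torch_dtype=torch.float16)
# The merge leaves the vocabulary unchanged, so the primary model's tokenizer applies.
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")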