Update patch.diff
patch.diff CHANGED  (+16 -18)
@@ -1,23 +1,21 @@
 diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
-index a0fbe4680a..
+index a0fbe4680a..73944d2c2c 100644
 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
 +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
-@@ -17,10 +17,
+@@ -17,10 +17,10 @@ import json
 import os
 import shutil
 import warnings
 -
-+from turtledemo.sorting_animate import instructions1
 +from typing import List
 import torch

 -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
-+from setup import install_requires
 +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, GenerationConfig
 from transformers.convert_slow_tokenizer import TikTokenConverter


-@@ -85,8 +
+@@ -85,8 +85,12 @@ NUM_SHARDS = {
     "65B": 8,
     "70B": 8,
     "70Bf": 8,
@@ -30,7 +28,7 @@ index a0fbe4680a..a6de9fe3ef 100644

 def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
     return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
-@@ -107,9 +
+@@ -107,9 +111,10 @@ def write_model(
     input_base_path,
     model_size=None,
     safe_serialization=True,
@@ -42,7 +40,7 @@ index a0fbe4680a..a6de9fe3ef 100644
 ):
     os.makedirs(model_path, exist_ok=True)
     tmp_model_path = os.path.join(model_path, "tmp")
-@@ -125,18 +
+@@ -125,18 +130,11 @@ def write_model(
     dims_per_head = dim // n_heads
     base = params.get("rope_theta", 10000.0)
     inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
@@ -64,7 +62,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     if params.get("n_kv_heads", None) is not None:
         num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
         num_key_value_heads_per_shard = num_key_value_heads // num_shards
-@@ -144,8 +
+@@ -144,8 +142,7 @@ def write_model(
     else: # compatibility with other checkpoints
         num_key_value_heads = n_heads
         num_key_value_heads_per_shard = n_heads_per_shard
@@ -74,7 +72,7 @@ index a0fbe4680a..a6de9fe3ef 100644

     # permute for sliced rotary
     def permute(w, n_heads, dim1=dim, dim2=dim):
-@@ -159,11 +
+@@ -159,11 +156,9 @@ def write_model(
         loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
     else:
         # Sharded
@@ -89,7 +87,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     param_count = 0
     index_dict = {"weight_map": {}}
     for layer_i in range(n_layers):
-@@ -263,7 +
+@@ -263,7 +258,7 @@ def write_model(
             "lm_head.weight": loaded["output.weight"],
         }
     else:
@@ -98,7 +96,7 @@ index a0fbe4680a..a6de9fe3ef 100644
         state_dict = {
             "model.norm.weight": loaded[0]["norm.weight"],
             "model.embed_tokens.weight": torch.cat(
-@@ -282,6 +
+@@ -282,6 +277,18 @@ def write_model(
     write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
     ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
     multiple_of = params["multiple_of"] if "multiple_of" in params else 256
@@ -117,7 +115,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     config = LlamaConfig(
         hidden_size=dim,
         intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
-@@ -292,11 +
+@@ -292,11 +299,21 @@ def write_model(
         vocab_size=vocab_size,
         rope_theta=base,
         max_position_embeddings=max_position_embeddings,
@@ -141,7 +139,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     # Make space so we can load the model properly now.
     del state_dict
     del loaded
-@@ -313,7 +
+@@ -313,7 +330,7 @@ def write_model(


 class Llama3Converter(TikTokenConverter):
@@ -150,7 +148,7 @@ index a0fbe4680a..a6de9fe3ef 100644
         super().__init__(vocab_file, **kwargs)
         tokenizer = self.converted()
         chat_template = (
-@@ -327,34 +
+@@ -327,34 +344,27 @@ class Llama3Converter(TikTokenConverter):
             "{% endfor %}"
             "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
         )
@@ -196,7 +194,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     else:
         tokenizer = tokenizer_class(input_tokenizer_path)
     print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
-@@ -362,6 +
+@@ -362,6 +372,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2):
     return tokenizer


@@ -234,7 +232,7 @@ index a0fbe4680a..a6de9fe3ef 100644
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-@@ -383,9 +
+@@ -383,9 +424,9 @@ def main():
     # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used.
     parser.add_argument(
         "--llama_version",
@@ -247,7 +245,7 @@ index a0fbe4680a..a6de9fe3ef 100644
         help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size",
     )
     parser.add_argument(
-@@ -394,11 +
+@@ -394,11 +435,34 @@ def main():
         type=int,
         help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth",
     )
@@ -283,7 +281,7 @@ index a0fbe4680a..a6de9fe3ef 100644
     if args.model_size != "tokenizer_only":
         write_model(
             model_path=args.output_dir,
-@@ -408,6 +
+@@ -408,6 +472,7 @@ def main():
             llama_version=args.llama_version,
             vocab_size=vocab_size,
             num_shards=args.num_shards,