Update patch.diff

patch.diff (changed: +16 -57)
@@ -1,21 +1,23 @@
 diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
-index a0fbe4680..50c7ed738 100644
+index a0fbe4680a..a6de9fe3ef 100644
 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
 +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
-@@ -17,10 +17,
+@@ -17,10 +17,12 @@ import json
 import os
 import shutil
 import warnings
 -
++from turtledemo.sorting_animate import instructions1
 +from typing import List
 import torch

 -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
++from setup import install_requires
 +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, GenerationConfig
 from transformers.convert_slow_tokenizer import TikTokenConverter


-@@ -85,8 +
+@@ -85,8 +87,12 @@ NUM_SHARDS = {
 "65B": 8,
 "70B": 8,
 "70Bf": 8,
@@ -28,7 +30,7 @@ index a0fbe4680..50c7ed738 100644

 def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
 return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
-@@ -107,9 +
+@@ -107,9 +113,10 @@ def write_model(
 input_base_path,
 model_size=None,
 safe_serialization=True,
@@ -40,7 +42,7 @@ index a0fbe4680..50c7ed738 100644
 ):
 os.makedirs(model_path, exist_ok=True)
 tmp_model_path = os.path.join(model_path, "tmp")
-@@ -125,18 +
+@@ -125,18 +132,11 @@ def write_model(
 dims_per_head = dim // n_heads
 base = params.get("rope_theta", 10000.0)
 inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
@@ -62,7 +64,7 @@ index a0fbe4680..50c7ed738 100644
 if params.get("n_kv_heads", None) is not None:
 num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
 num_key_value_heads_per_shard = num_key_value_heads // num_shards
-@@ -144,8 +
+@@ -144,8 +144,7 @@ def write_model(
 else: # compatibility with other checkpoints
 num_key_value_heads = n_heads
 num_key_value_heads_per_shard = n_heads_per_shard
@@ -72,7 +74,7 @@ index a0fbe4680..50c7ed738 100644

 # permute for sliced rotary
 def permute(w, n_heads, dim1=dim, dim2=dim):
-@@ -159,11 +
+@@ -159,11 +158,9 @@ def write_model(
 loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
 else:
 # Sharded
@@ -87,7 +89,7 @@ index a0fbe4680..50c7ed738 100644
 param_count = 0
 index_dict = {"weight_map": {}}
 for layer_i in range(n_layers):
-@@ -263,7 +
+@@ -263,7 +260,7 @@ def write_model(
 "lm_head.weight": loaded["output.weight"],
 }
 else:
@@ -96,7 +98,7 @@ index a0fbe4680..50c7ed738 100644
 state_dict = {
 "model.norm.weight": loaded[0]["norm.weight"],
 "model.embed_tokens.weight": torch.cat(
-@@ -282,6 +
+@@ -282,6 +279,18 @@ def write_model(
 write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
 ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
 multiple_of = params["multiple_of"] if "multiple_of" in params else 256
@@ -115,7 +117,7 @@ index a0fbe4680..50c7ed738 100644
 config = LlamaConfig(
 hidden_size=dim,
 intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
-@@ -292,11 +
+@@ -292,11 +301,21 @@ def write_model(
 vocab_size=vocab_size,
 rope_theta=base,
 max_position_embeddings=max_position_embeddings,
@@ -139,7 +141,7 @@ index a0fbe4680..50c7ed738 100644
 # Make space so we can load the model properly now.
 del state_dict
 del loaded
-@@ -313,7 +
+@@ -313,7 +332,7 @@ def write_model(


 class Llama3Converter(TikTokenConverter):
@@ -148,7 +150,7 @@ index a0fbe4680..50c7ed738 100644
 super().__init__(vocab_file, **kwargs)
 tokenizer = self.converted()
 chat_template = (
-@@ -327,34 +
+@@ -327,34 +346,27 @@ class Llama3Converter(TikTokenConverter):
 "{% endfor %}"
 "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
 )
@@ -167,14 +169,13 @@ index a0fbe4680..50c7ed738 100644
 - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
 tokenizer.add_special_tokens(special_tokens)

-+ print("MODEL MAX LENGTH", model_max_length)
-+
 self.tokenizer = PreTrainedTokenizerFast(
 tokenizer_object=tokenizer,
 bos_token="<|begin_of_text|>",
 - eos_token="<|end_of_text|>",
+- chat_template=chat_template,
 + eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>",
-
++ chat_template=chat_template if instruct else None,
 model_input_names=["input_ids", "attention_mask"],
 + model_max_length=model_max_length,
 )
@@ -290,45 +291,3 @@ index a0fbe4680..50c7ed738 100644
 )


-diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
-index 8cbe8fe35..65b4bb56b 100644
---- a/src/transformers/models/llama/modeling_llama.py
-+++ b/src/transformers/models/llama/modeling_llama.py
-@@ -90,6 +90,29 @@ class LlamaRMSNorm(nn.Module):
-
-ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
-
-+def apply_scaling(freqs: torch.Tensor):
-+ # Values obtained from grid search
-+ scale_factor = 8
-+ low_freq_factor = 1
-+ high_freq_factor = 4
-+ old_context_len = 8192  # original llama3 length
-+
-+ low_freq_wavelen = old_context_len / low_freq_factor
-+ high_freq_wavelen = old_context_len / high_freq_factor
-+ new_freqs = []
-+ for freq in freqs:
-+ wavelen = 2 * math.pi / freq
-+ if wavelen < high_freq_wavelen:
-+ new_freqs.append(freq)
-+ elif wavelen > low_freq_wavelen:
-+ new_freqs.append(freq / scale_factor)
-+ else:
-+ assert low_freq_wavelen != high_freq_wavelen
-+ smooth = (old_context_len / wavelen - low_freq_factor) / (
-+ high_freq_factor - low_freq_factor
-+ )
-+ new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
-+ return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
-
-class LlamaRotaryEmbedding(nn.Module):
-def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-@@ -99,6 +122,7 @@ class LlamaRotaryEmbedding(nn.Module):
-self.max_position_embeddings = max_position_embeddings
-self.base = base
-inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
-+ inv_freq = apply_scaling(inv_freq)
-self.register_buffer("inv_freq", inv_freq, persistent=False)
-# For BC we register cos and sin cached
-self.max_seq_len_cached = max_position_embeddings
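The block removed at the end of the patch is the RoPE frequency rescaling (apply_scaling) that had been hacked directly into LlamaRotaryEmbedding.__init__ in modeling_llama.py. For reference, here is a self-contained sketch of what that removed hunk computed; the constants are copied from the hunk itself, while dim = 128 and base = 500000.0 in the usage example are illustrative values, not taken from this patch.

import math

import torch


def apply_scaling(freqs: torch.Tensor) -> torch.Tensor:
    # Constants from the removed hunk ("values obtained from grid search").
    scale_factor = 8
    low_freq_factor = 1
    high_freq_factor = 4
    old_context_len = 8192  # original llama3 context length

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            new_freqs.append(freq)  # high-frequency components stay as they are
        elif wavelen > low_freq_wavelen:
            new_freqs.append(freq / scale_factor)  # low frequencies are scaled down
        else:
            # smooth interpolation between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)


# Usage mirroring the removed hunk: rescale inv_freq right after it is computed.
dim, base = 128, 500000.0  # illustrative values only
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
inv_freq = apply_scaling(inv_freq)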
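The compute_intermediate_size helper that appears as context near the top of the patch rounds ffn_dim_multiplier * (8n/3) up to the next multiple of multiple_of. A quick worked check; the 4096 -> 11008 pair is the standard LLaMA-7B configuration, used here only as an illustration.

def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
    # Round ffn_dim_multiplier * (8 * n / 3) up to the next multiple of `multiple_of`.
    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)


# hidden_size 4096: int(8 * 4096 / 3) = 10922, rounded up to a multiple of 256 -> 11008
print(compute_intermediate_size(4096))  # 11008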
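The main functional change to the patch is in the Llama3Converter hunk: instruct checkpoints now get <|eot_id|> as the EOS token and keep the chat template, while base checkpoints keep <|end_of_text|> and get no template, and model_max_length is passed through to the fast tokenizer. A minimal sketch of that call shape, assuming an empty tokenizers object as a stand-in for TikTokenConverter.converted(), a placeholder template string, and an example model_max_length of 8192.

from tokenizers import Tokenizer, models
from transformers import PreTrainedTokenizerFast

instruct = True
chat_template = "{% for message in messages %}...{% endfor %}"  # placeholder for the template built in the converter
tok_object = Tokenizer(models.BPE())  # stand-in for the converted tiktoken vocabulary

fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tok_object,
    bos_token="<|begin_of_text|>",
    eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>",
    chat_template=chat_template if instruct else None,
    model_input_names=["input_ids", "attention_mask"],
    model_max_length=8192,  # example value; the converter receives this as an argument
)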