lysandre (HF staff) committed · verified
Commit 5a75758 · 1 Parent(s): 27a2933

Update patch.diff

Files changed (1):
  1. patch.diff +16 -57
patch.diff CHANGED
@@ -1,21 +1,23 @@
  diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
- index a0fbe4680..50c7ed738 100644
+ index a0fbe4680a..a6de9fe3ef 100644
  --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
  +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
- @@ -17,10 +17,10 @@ import json
+ @@ -17,10 +17,12 @@ import json
  import os
  import shutil
  import warnings
  -
+ +from turtledemo.sorting_animate import instructions1
  +from typing import List
  import torch

  -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
+ +from setup import install_requires
  +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, GenerationConfig
  from transformers.convert_slow_tokenizer import TikTokenConverter


- @@ -85,8 +85,12 @@ NUM_SHARDS = {
+ @@ -85,8 +87,12 @@ NUM_SHARDS = {
  "65B": 8,
  "70B": 8,
  "70Bf": 8,
@@ -28,7 +30,7 @@ index a0fbe4680..50c7ed738 100644

  def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
  return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
- @@ -107,9 +111,10 @@ def write_model(
+ @@ -107,9 +113,10 @@ def write_model(
  input_base_path,
  model_size=None,
  safe_serialization=True,
@@ -40,7 +42,7 @@ index a0fbe4680..50c7ed738 100644
  ):
  os.makedirs(model_path, exist_ok=True)
  tmp_model_path = os.path.join(model_path, "tmp")
- @@ -125,18 +130,11 @@ def write_model(
+ @@ -125,18 +132,11 @@ def write_model(
  dims_per_head = dim // n_heads
  base = params.get("rope_theta", 10000.0)
  inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
@@ -62,7 +64,7 @@ index a0fbe4680..50c7ed738 100644
  if params.get("n_kv_heads", None) is not None:
  num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
  num_key_value_heads_per_shard = num_key_value_heads // num_shards
- @@ -144,8 +142,7 @@ def write_model(
+ @@ -144,8 +144,7 @@ def write_model(
  else: # compatibility with other checkpoints
  num_key_value_heads = n_heads
  num_key_value_heads_per_shard = n_heads_per_shard
@@ -72,7 +74,7 @@ index a0fbe4680..50c7ed738 100644

  # permute for sliced rotary
  def permute(w, n_heads, dim1=dim, dim2=dim):
- @@ -159,11 +156,9 @@ def write_model(
+ @@ -159,11 +158,9 @@ def write_model(
  loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
  else:
  # Sharded
@@ -87,7 +89,7 @@ index a0fbe4680..50c7ed738 100644
  param_count = 0
  index_dict = {"weight_map": {}}
  for layer_i in range(n_layers):
- @@ -263,7 +258,7 @@ def write_model(
+ @@ -263,7 +260,7 @@ def write_model(
  "lm_head.weight": loaded["output.weight"],
  }
  else:
@@ -96,7 +98,7 @@ index a0fbe4680..50c7ed738 100644
  state_dict = {
  "model.norm.weight": loaded[0]["norm.weight"],
  "model.embed_tokens.weight": torch.cat(
- @@ -282,6 +277,18 @@ def write_model(
+ @@ -282,6 +279,18 @@ def write_model(
  write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
  ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
  multiple_of = params["multiple_of"] if "multiple_of" in params else 256
@@ -115,7 +117,7 @@ index a0fbe4680..50c7ed738 100644
  config = LlamaConfig(
  hidden_size=dim,
  intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
- @@ -292,11 +299,21 @@ def write_model(
+ @@ -292,11 +301,21 @@ def write_model(
  vocab_size=vocab_size,
  rope_theta=base,
  max_position_embeddings=max_position_embeddings,
@@ -139,7 +141,7 @@ index a0fbe4680..50c7ed738 100644
  # Make space so we can load the model properly now.
  del state_dict
  del loaded
- @@ -313,7 +330,7 @@ def write_model(
+ @@ -313,7 +332,7 @@ def write_model(


  class Llama3Converter(TikTokenConverter):
@@ -148,7 +150,7 @@ index a0fbe4680..50c7ed738 100644
  super().__init__(vocab_file, **kwargs)
  tokenizer = self.converted()
  chat_template = (
- @@ -327,34 +344,29 @@ class Llama3Converter(TikTokenConverter):
+ @@ -327,34 +346,27 @@ class Llama3Converter(TikTokenConverter):
  "{% endfor %}"
  "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
  )
@@ -167,14 +169,13 @@ index a0fbe4680..50c7ed738 100644
  - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
  tokenizer.add_special_tokens(special_tokens)

- + print("MODEL MAX LENGTH", model_max_length)
- +
  self.tokenizer = PreTrainedTokenizerFast(
  tokenizer_object=tokenizer,
  bos_token="<|begin_of_text|>",
  - eos_token="<|end_of_text|>",
+ - chat_template=chat_template,
  + eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>",
- chat_template=chat_template,
+ + chat_template=chat_template if instruct else None,
  model_input_names=["input_ids", "attention_mask"],
  + model_max_length=model_max_length,
  )
@@ -290,45 +291,3 @@ index a0fbe4680..50c7ed738 100644
  )


- diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
- index 8cbe8fe35..65b4bb56b 100644
- --- a/src/transformers/models/llama/modeling_llama.py
- +++ b/src/transformers/models/llama/modeling_llama.py
- @@ -90,6 +90,29 @@ class LlamaRMSNorm(nn.Module):
-
- ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
-
- +def apply_scaling(freqs: torch.Tensor):
- + # Values obtained from grid search
- + scale_factor = 8
- + low_freq_factor = 1
- + high_freq_factor = 4
- + old_context_len = 8192 # original llama3 length
- +
- + low_freq_wavelen = old_context_len / low_freq_factor
- + high_freq_wavelen = old_context_len / high_freq_factor
- + new_freqs = []
- + for freq in freqs:
- + wavelen = 2 * math.pi / freq
- + if wavelen < high_freq_wavelen:
- + new_freqs.append(freq)
- + elif wavelen > low_freq_wavelen:
- + new_freqs.append(freq / scale_factor)
- + else:
- + assert low_freq_wavelen != high_freq_wavelen
- + smooth = (old_context_len / wavelen - low_freq_factor) / (
- + high_freq_factor - low_freq_factor
- + )
- + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
- + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)
-
- class LlamaRotaryEmbedding(nn.Module):
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
- @@ -99,6 +122,7 @@ class LlamaRotaryEmbedding(nn.Module):
- self.max_position_embeddings = max_position_embeddings
- self.base = base
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
- + inv_freq = apply_scaling(inv_freq)
- self.register_buffer("inv_freq", inv_freq, persistent=False)
- # For BC we register cos and sin cached
- self.max_seq_len_cached = max_position_embeddings
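For reference, the RoPE frequency-rescaling helper that the earlier revision of this patch inlined into modeling_llama.py (and that this commit drops from patch.diff) is restated below as a self-contained sketch. The function body is taken verbatim from the removed hunk, with indentation restored and the imports it needs added; the trailing inv_freq lines are only an illustrative usage with an assumed per-head dimension of 128, mirroring the removed change to LlamaRotaryEmbedding.__init__.

import math

import torch


def apply_scaling(freqs: torch.Tensor):
    # Values obtained from grid search
    scale_factor = 8
    low_freq_factor = 1
    high_freq_factor = 4
    old_context_len = 8192  # original llama3 length

    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor
    new_freqs = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            # High-frequency components (short wavelengths) are kept as-is.
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            # Low-frequency components are scaled down by scale_factor.
            new_freqs.append(freq / scale_factor)
        else:
            # Smoothly interpolate between the scaled and unscaled regimes.
            assert low_freq_wavelen != high_freq_wavelen
            smooth = (old_context_len / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)


# Illustrative usage, as in the removed LlamaRotaryEmbedding.__init__ change:
dim = 128  # assumed per-head dimension, not taken from the diff
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
inv_freq = apply_scaling(inv_freq)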