lysandre (HF staff) committed
Commit f59ac25 · verified · 1 parent: 5a75758

Update patch.diff

Files changed (1): patch.diff (+16, -18)
patch.diff CHANGED
@@ -1,23 +1,21 @@
  diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
- index a0fbe4680a..a6de9fe3ef 100644
+ index a0fbe4680a..73944d2c2c 100644
  --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
  +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
- @@ -17,10 +17,12 @@ import json
+ @@ -17,10 +17,10 @@ import json
  import os
  import shutil
  import warnings
  -
- +from turtledemo.sorting_animate import instructions1
  +from typing import List
  import torch

  -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast
- +from setup import install_requires
  +from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, GenerationConfig
  from transformers.convert_slow_tokenizer import TikTokenConverter


- @@ -85,8 +87,12 @@ NUM_SHARDS = {
+ @@ -85,8 +85,12 @@ NUM_SHARDS = {
  "65B": 8,
  "70B": 8,
  "70Bf": 8,
@@ -30,7 +28,7 @@ index a0fbe4680a..a6de9fe3ef 100644

  def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
  return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
- @@ -107,9 +113,10 @@ def write_model(
+ @@ -107,9 +111,10 @@ def write_model(
  input_base_path,
  model_size=None,
  safe_serialization=True,
@@ -42,7 +40,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  ):
  os.makedirs(model_path, exist_ok=True)
  tmp_model_path = os.path.join(model_path, "tmp")
- @@ -125,18 +132,11 @@ def write_model(
+ @@ -125,18 +130,11 @@ def write_model(
  dims_per_head = dim // n_heads
  base = params.get("rope_theta", 10000.0)
  inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
@@ -64,7 +62,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  if params.get("n_kv_heads", None) is not None:
  num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
  num_key_value_heads_per_shard = num_key_value_heads // num_shards
- @@ -144,8 +144,7 @@ def write_model(
+ @@ -144,8 +142,7 @@ def write_model(
  else: # compatibility with other checkpoints
  num_key_value_heads = n_heads
  num_key_value_heads_per_shard = n_heads_per_shard
@@ -74,7 +72,7 @@ index a0fbe4680a..a6de9fe3ef 100644

  # permute for sliced rotary
  def permute(w, n_heads, dim1=dim, dim2=dim):
- @@ -159,11 +158,9 @@ def write_model(
+ @@ -159,11 +156,9 @@ def write_model(
  loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
  else:
  # Sharded
@@ -89,7 +87,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  param_count = 0
  index_dict = {"weight_map": {}}
  for layer_i in range(n_layers):
- @@ -263,7 +260,7 @@ def write_model(
+ @@ -263,7 +258,7 @@ def write_model(
  "lm_head.weight": loaded["output.weight"],
  }
  else:
@@ -98,7 +96,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  state_dict = {
  "model.norm.weight": loaded[0]["norm.weight"],
  "model.embed_tokens.weight": torch.cat(
- @@ -282,6 +279,18 @@ def write_model(
+ @@ -282,6 +277,18 @@ def write_model(
  write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
  ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
  multiple_of = params["multiple_of"] if "multiple_of" in params else 256
@@ -117,7 +115,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  config = LlamaConfig(
  hidden_size=dim,
  intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
- @@ -292,11 +301,21 @@ def write_model(
+ @@ -292,11 +299,21 @@ def write_model(
  vocab_size=vocab_size,
  rope_theta=base,
  max_position_embeddings=max_position_embeddings,
@@ -141,7 +139,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  # Make space so we can load the model properly now.
  del state_dict
  del loaded
- @@ -313,7 +332,7 @@ def write_model(
+ @@ -313,7 +330,7 @@ def write_model(


  class Llama3Converter(TikTokenConverter):
@@ -150,7 +148,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  super().__init__(vocab_file, **kwargs)
  tokenizer = self.converted()
  chat_template = (
- @@ -327,34 +346,27 @@ class Llama3Converter(TikTokenConverter):
+ @@ -327,34 +344,27 @@ class Llama3Converter(TikTokenConverter):
  "{% endfor %}"
  "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
  )
@@ -196,7 +194,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  else:
  tokenizer = tokenizer_class(input_tokenizer_path)
  print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
- @@ -362,6 +374,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2):
+ @@ -362,6 +372,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2):
  return tokenizer


@@ -234,7 +232,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
- @@ -383,9 +426,9 @@ def main():
+ @@ -383,9 +424,9 @@ def main():
  # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used.
  parser.add_argument(
  "--llama_version",
@@ -247,7 +245,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size",
  )
  parser.add_argument(
- @@ -394,11 +437,34 @@ def main():
+ @@ -394,11 +435,34 @@ def main():
  type=int,
  help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth",
  )
@@ -283,7 +281,7 @@ index a0fbe4680a..a6de9fe3ef 100644
  if args.model_size != "tokenizer_only":
  write_model(
  model_path=args.output_dir,
- @@ -408,6 +474,7 @@ def main():
+ @@ -408,6 +472,7 @@ def main():
  llama_version=args.llama_version,
  vocab_size=vocab_size,
  num_shards=args.num_shards,
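
For reference, once the corrected patch.diff is applied, the import block of convert_llama_weights_to_hf.py covered by the `@@ -17,10 +17,10 @@` hunk should read roughly as follows. This is a sketch reassembled from the hunk context shown above, not the verbatim file; per the hunk header, `import json` sits just above this region.

import os
import shutil
import warnings
from typing import List  # added by the patch, replacing the blank line it removes
import torch

from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, GenerationConfig  # GenerationConfig is newly imported
from transformers.convert_slow_tokenizer import TikTokenConverter

The stray `from turtledemo.sorting_animate import instructions1` and `from setup import install_requires` lines removed by this commit no longer appear.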