tvosch committed on
Commit: a905447
1 Parent(s): c34077d

quick qlora support

Files changed (4)
  1. README.md +9 -0
  2. app.py +34 -145
  3. estimate_train_vram.py +51 -0
  4. vram_helpers.py +103 -0
README.md CHANGED
@@ -10,3 +10,12 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+TODO:
+- include vocab size as manual parameter
+- include finetuning techniques (LoRA/QLoRA/LoftQ)
+- include inference (KV cache memory; a rough formula is sketched below)
+- include number of experts for Mixture of Experts (MoE) models
+- include DeepSpeed communication memory overhead (`allgather_bucket_size`)
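
A rough sketch of the KV-cache term mentioned in the TODO list, written in the same style as the helpers in `vram_helpers.py`. This function is hypothetical and not part of this commit; it assumes standard multi-head attention (no grouped-query sharing of key/value heads) and 16-bit cache entries by default:

```python
def kv_cache_memory(num_layers, sequence_length, batch_size, hidden_size, precision_bytes=2):
    # 2x for keys and values, cached for every layer, position, and batch element
    bytes_total = 2 * num_layers * sequence_length * batch_size * hidden_size * precision_bytes
    return bytes_total / 10**9  # GB, matching the convention of activations_memory
```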
app.py CHANGED
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 import argparse
 from functools import partial
 
@@ -6,14 +5,8 @@ from functools import partial
 import gradio as gr
 from transformers import AutoConfig
 
-
-PRECISION_TO_BYTES = {"float32": 4,
-                      "fp32": 4,
-                      "float16": 2,
-                      "fp16": 2,
-                      "bfloat16": 2,
-                      "bf16": 2,
-                      "int8": 1}
+from estimate_train_vram import vram_required
+from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclass
 
 ZERO_STAGES = [0, 1, 2, 3]
 BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
@@ -21,29 +14,6 @@ OPTIMIZERS = ["adam", "adamw", "sgd"]
 HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
 
 
-@dataclass
-class ModelConfig:
-    model_size: float
-    hidden_size: int
-    sequence_length: int
-    num_layers: int
-    num_heads: int
-
-    def overwrite_with_hf_config(self, config: dict):
-        self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
-        self.hidden_size = config["hidden_size"]
-        self.sequence_length = config["max_position_embeddings"]
-        self.num_layers = config["num_hidden_layers"]
-        self.num_heads = config["num_attention_heads"]
-
-@dataclass
-class TrainingConfig:
-    micro_batch_size: int
-    num_gpus: int
-    optimizer: str
-    zero_stage: int
-    gradient_checkpointing: False
-    mixed_precision: False
 
 
 def parse_args():
@@ -55,51 +25,20 @@ def parse_args():
     parser.add_argument("--sequence_length", type=int, default=8192, help="Sequence length")
     parser.add_argument("--num_layers", type=int, default=32, help="Number of layers")
     parser.add_argument("--num_heads", type=int, default=32, help="Number of heads")
+    parser.add_argument("--mixed_precision", action="store_true", help="Enable mixed precision for model training")
+    parser.add_argument("--precision", type=str, default="bf16", help="Model precision for training")
     parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size (batch size per device/GPU)")
     parser.add_argument("--zero_stage", type=int, default=0, choices=ZERO_STAGES, help="ZeRO optimization stage")
     parser.add_argument("--gradient_checkpointing", action="store_false", help="Enable gradient checkpointing")
-    parser.add_argument("--mixed_precision", action="store_false", help="Enable mixed precision for model training")
     parser.add_argument("--optimizer", type=str, default="adamw", choices=OPTIMIZERS, help="Type of optimizer")
     parser.add_argument("--num_gpus", type=int, default=4, help="Number of GPUs. Necessary for estimating ZeRO stages")
     parser.add_argument("--cache_dir", type=str, default=None, help="HuggingFace cache directory to download config from")
+    parser.add_argument("--qlora", action="store_true", help="Enable QLoRA in case of finetuning")
 
     parser.add_argument("--no-app", action="store_true", help="Launch gradio app. Otherwise, commandline output")
     return parser
 
-def get_model_size_from_config(config: dict):
-    # Embedding parameters:
-    embedding_params = config["vocab_size"] * config["hidden_size"]
-
-    # Transformer layer parameters
-    def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads):
-        input_layernorm_params = hidden_size
-        mlp_down_proj_params = hidden_size * intermediate_size
-        mlp_gate_proj_params = intermediate_size * hidden_size
-        mlp_up_proj_params = intermediate_size * hidden_size
-        post_attention_layernorm_params = hidden_size
-        self_attn_k_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
-        self_attn_o_proj_params = hidden_size * hidden_size
-        self_attn_q_proj_params = hidden_size * hidden_size
-        self_attn_v_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
-
-        total_layer_params = (
-            input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
-            post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
-            self_attn_q_proj_params + self_attn_v_proj_params
-        )
-
-        return total_layer_params
-
-    # Total parameters for all transformer layers
-    single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"])
-    total_transformer_params = config["num_hidden_layers"] * single_layer_params
-
-    # Output layer parameters
-    output_params = config["vocab_size"] * config["hidden_size"]
-
-    # Total parameters
-    total_params = embedding_params + total_transformer_params + output_params
-    return total_params
 
 
 def download_config_from_hub(repo_id: str, cache_dir: str):
@@ -128,66 +67,11 @@ def scrape_config_from_hub(repo_id):
 
     return config
 
-def model_memory(parameters, precision = "bf16", mixed_precision = False):
-    if mixed_precision:
-        return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
-    return parameters * PRECISION_TO_BYTES[precision]
-
-
-def gradients_memory(parameters, precision = "fp32"):
-    return parameters * PRECISION_TO_BYTES[precision]
-
-def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
-    optimizer_choices = {"adam": 3,
-                         "adamw": 2,
-                         "sgd": 1}
-    return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
-
-def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
-    # Reference: https://arxiv.org/pdf/2205.05198
-    # Activations assumed to be in 16-bit floating precision
-    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
-    bytes_model = bytes_per_layer * num_layers
-    return round(bytes_model / 10**9, 2)
-
-def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
-    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
-
-    model_vram = model_memory(model_size, mixed_precision=mixed_precision)
-    gradients_vram = gradients_memory(model_size)
-    optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
-
-    # Baseline
-    if zero_stage == 0:
-        pass
-    # Optimizer state partitioning
-    if zero_stage >= 1:
-        optimizer_vram = optimizer_vram / num_gpus
-    # Gradient + Optimzer state partitioning
-    if zero_stage >= 2:
-        gradients_vram = gradients_vram / num_gpus
-    # Parameter partitioning + Gradient + Optimizer partitioning
-    if zero_stage == 3:
-        aggregated_vram = model_vram / num_gpus
-
-    aggregated_vram = round(model_vram, 2) + gradients_vram + optimizer_vram
-
-    activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
-    if gradient_checkpointing:
-        activations_vram = round(activations_vram ** 0.5, 2)
-
-    total_vram = aggregated_vram + activations_vram
-    return {"total": total_vram, "model": model_vram, "gradients": gradients_vram, "optimizer": optimizer_vram, "activations": activations_vram}
 
 def build_interface(estimate_vram_fn):
-    training_params = []
     with gr.Blocks() as app:
-
         option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")
-
-        repo_id = gr.Textbox(label="Repo ID", visible=False)
-
-
+        repo_id = gr.Textbox(label="Repo ID", visible=False, placeholder="mistralai/Mistral-7B-v0.1")
 
         with gr.Row(visible=False) as model_params_row:
             model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
@@ -217,6 +101,7 @@ def build_interface(estimate_vram_fn):
             gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
             gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
             gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
+            gr.Dropdown(label="QLoRA", choices=[False, True], value=False, info="Finetune with QLoRA enabled"),
             gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
             gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
         ]
@@ -225,51 +110,55 @@ def build_interface(estimate_vram_fn):
 
         output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
 
+        def create_combined_params_dict(repo_id, *values):
+            all_params = model_params + training_params
+            combined_dict = {param.label.lower().replace(" ", "_"): value for param, value in zip(all_params, values)}
+            combined_dict["repo_id"] = repo_id
+            return combined_dict
+
         submit_btn.click(
-            fn=estimate_vram_fn,
-            inputs=[repo_id, *model_params, *training_params],
-            outputs=[output]
-        )
-
+            fn=lambda repo_id, *values: estimate_vram_fn(create_combined_params_dict(repo_id, *values)),
+            inputs=[repo_id] + model_params + training_params,
+            outputs=[output]
+        )
     return app
 
 
-def estimate_vram(arg_keys, *args):
-    params = dict(zip(arg_keys, args))
-    print("Parameters: ", params)
-
-    model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
-    training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
-    if not params["repo_id"]:
+def estimate_vram(gradio_params):
+    model_config = ModelConfig(**filter_params_for_dataclass(ModelConfig, gradio_params))
+    training_config = TrainingConfig(**filter_params_for_dataclass(TrainingConfig, gradio_params))
+    # Update model config
+    if not gradio_params["repo_id"]:
        return "No model selected!"
    # If cache directory set, then download config
-    if params["cache_dir"]:
-        config = scrape_config_from_hub(params["repo_id"])
+    if gradio_params["cache_dir"]:
+        config = scrape_config_from_hub(gradio_params["repo_id"])
        model_config.overwrite_with_hf_config(config)
    # By default, scrape config.json from hub
    else:
-        config = download_config_from_hub(params["repo_id"], params["cache_dir"])
+        config = download_config_from_hub(gradio_params["repo_id"], gradio_params["cache_dir"])
        model_config.overwrite_with_hf_config(config.to_dict())
 
-    total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
+    if gradio_params["qlora"]:
+        model_config.precision = "int4"
+    total_vram_dict = vram_required(model_config, training_config)
    output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
    return output_str
 
 if __name__ == "__main__":
     parser = parse_args()
     args = parser.parse_args()
 
     # Launch gradio interface
     if not args.no_app:
         import gradio as gr
-        arg_keys = list(vars(args).keys())
-        estimate_vram_fn = partial(estimate_vram, arg_keys)
+        estimate_vram_fn = partial(estimate_vram)
         interface = build_interface(estimate_vram_fn)
         interface.launch()
     # Command line interface
     else:
-        model_config = ModelConfig(args.model_size, args.hidden_size, args.sequence_length, args.num_layers, args.num_heads)
-        training_config = TrainingConfig(args.micro_batch_size, args.num_gpus, args.optimizer, args.zero_stage, args.gradient_checkpointing, args.mixed_precision)
+        model_config = ModelConfig(**filter_params_for_dataclass(ModelConfig, vars(args)))
+        training_config = TrainingConfig(**filter_params_for_dataclass(TrainingConfig, vars(args)))
         if args.repo_id:
             # If cache directory set, then download config
             if args.cache_dir:
@@ -279,5 +168,5 @@ if __name__ == "__main__":
                 config = scrape_config_from_hub(args.repo_id)
             model_config.overwrite_with_hf_config(config)
 
-        total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
-        print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations")
+        total_vram_dict = vram_required(model_config, training_config)
+        print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB (activations)")
 
 
estimate_train_vram.py ADDED
@@ -0,0 +1,51 @@
+
+from vram_helpers import model_memory, gradients_memory, optimizer_memory, activations_memory
+
+
+def vram_required(model_config, training_config):
+    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+    trainable_parameters = model_config.model_size
+    if training_config.qlora:
+        model_config.precision = "int4"
+        # LoRA adapters are generally around 4-8% of the base model's parameters, so take 10% as a rough upper bound
+        trainable_parameters = 0.1 * model_config.model_size
+
+    model_vram = model_memory(parameters=trainable_parameters,
+                              precision=model_config.precision,
+                              mixed_precision=model_config.mixed_precision)
+
+    gradients_vram = gradients_memory(parameters=trainable_parameters)
+    optimizer_vram = optimizer_memory(parameters=trainable_parameters, optimizer=training_config.optimizer)
+
+    # Baseline
+    if training_config.zero_stage == 0:
+        pass
+    # Optimizer state partitioning
+    if training_config.zero_stage >= 1:
+        optimizer_vram = optimizer_vram / training_config.num_gpus
+    # Gradient + Optimizer state partitioning
+    if training_config.zero_stage >= 2:
+        gradients_vram = gradients_vram / training_config.num_gpus
+    # Parameter partitioning + Gradient + Optimizer partitioning
+    if training_config.zero_stage == 3:
+        model_vram = model_vram / training_config.num_gpus
+
+    aggregated_vram = model_vram + gradients_vram + optimizer_vram
+
+    activations_vram = activations_memory(model_config.num_layers,
+                                          model_config.sequence_length,
+                                          training_config.micro_batch_size,
+                                          model_config.hidden_size,
+                                          model_config.num_heads)
+    if training_config.gradient_checkpointing:
+        activations_vram = round(activations_vram ** 0.5, 2)
+
+    total_vram = aggregated_vram + activations_vram
+    return {k: round(v, 2) for k, v in {
+        "total": total_vram,
+        "model": model_vram,
+        "gradients": gradients_vram,
+        "optimizer": optimizer_vram,
+        "activations": activations_vram
+    }.items()}
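
For reference, a minimal sketch of how this new module is driven, mirroring the command-line path in `app.py`; the 7B-style values below are illustrative, not taken from the commit:

```python
from vram_helpers import ModelConfig, TrainingConfig
from estimate_train_vram import vram_required

model_config = ModelConfig(model_size=7.0, hidden_size=4096, sequence_length=8192,
                           num_layers=32, num_heads=32, precision="bf16")
training_config = TrainingConfig(micro_batch_size=4, num_gpus=4, optimizer="adamw",
                                 zero_stage=0, qlora=True, gradient_checkpointing=True)
vram = vram_required(model_config, training_config)
print(vram)  # {'total': ..., 'model': ..., 'gradients': ..., 'optimizer': ..., 'activations': ...}
```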
vram_helpers.py ADDED
@@ -0,0 +1,103 @@
+from dataclasses import dataclass, fields
+from typing import Optional
+
+
+PRECISION_TO_BYTES = {"float32": 4,
+                      "fp32": 4,
+                      "float16": 2,
+                      "fp16": 2,
+                      "bfloat16": 2,
+                      "bf16": 2,
+                      "int8": 1,
+                      "int4": 0.5}
+
+
+@dataclass
+class ModelConfig:
+    model_size: float
+    hidden_size: int
+    sequence_length: int
+    num_layers: int
+    num_heads: int
+    mixed_precision: bool = False
+    precision: str = "bf16"
+    repo_id: Optional[str] = None
+
+    def overwrite_with_hf_config(self, config: dict):
+        self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
+        self.hidden_size = config["hidden_size"]
+        self.sequence_length = config["max_position_embeddings"]
+        self.num_layers = config["num_hidden_layers"]
+        self.num_heads = config["num_attention_heads"]
+
+@dataclass
+class TrainingConfig:
+    micro_batch_size: int
+    num_gpus: int
+    optimizer: str
+    zero_stage: int
+    qlora: bool = False
+    gradient_checkpointing: bool = False
+
+# Utility function to filter params based on dataclass fields
+def filter_params_for_dataclass(dataclass_type, params):
+    return {field.name: params[field.name] for field in fields(dataclass_type) if field.name in params}
+
+def get_model_size_from_config(config: dict):
+    # Embedding parameters:
+    embedding_params = config["vocab_size"] * config["hidden_size"]
+
+    # Transformer layer parameters
+    def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads):
+        input_layernorm_params = hidden_size
+        mlp_down_proj_params = hidden_size * intermediate_size
+        mlp_gate_proj_params = intermediate_size * hidden_size
+        mlp_up_proj_params = intermediate_size * hidden_size
+        post_attention_layernorm_params = hidden_size
+        self_attn_k_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
+        self_attn_o_proj_params = hidden_size * hidden_size
+        self_attn_q_proj_params = hidden_size * hidden_size
+        self_attn_v_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
+
+        total_layer_params = (
+            input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
+            post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
+            self_attn_q_proj_params + self_attn_v_proj_params
+        )
+
+        return total_layer_params
+
+    # Total parameters for all transformer layers
+    single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"])
+    total_transformer_params = config["num_hidden_layers"] * single_layer_params
+
+    # Output layer parameters
+    output_params = config["vocab_size"] * config["hidden_size"]
+
+    # Total parameters
+    total_params = embedding_params + total_transformer_params + output_params
+    return total_params
+
+def model_memory(parameters, precision="bf16", mixed_precision=False):
+    if mixed_precision:
+        return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
+    return parameters * PRECISION_TO_BYTES[precision]
+
+
+def gradients_memory(parameters, precision="fp32"):
+    return parameters * PRECISION_TO_BYTES[precision]
+
+def optimizer_memory(parameters, optimizer="adamw", precision="fp32"):
+    optimizer_choices = {"adam": 3,        # Adam: stores full-precision copies of the parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
+                         "adamw": 3,       # AdamW: same as Adam
+                         "sgd": 2,         # SGD: stores optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
+                         "adam-8bit": 1.5, # Adam 8-bit: same layout as Adam -> 2 + 2 + 2 = 6 bytes per model parameter
+                         }
+    return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
+
+def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
+    # Reference: https://arxiv.org/pdf/2205.05198
+    # Activations assumed to be in 16-bit floating precision
+    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
+    bytes_model = bytes_per_layer * num_layers
+    return bytes_model / 10**9
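
As a sanity check on the constants above, this is the arithmetic these helpers produce for a hypothetical 7B-parameter model trained in bf16 with AdamW, before any ZeRO partitioning. Since `model_size` is expressed in billions of parameters, multiplying by bytes-per-parameter yields GB directly:

```python
from vram_helpers import model_memory, gradients_memory, optimizer_memory

params_in_billions = 7
print(model_memory(params_in_billions, precision="bf16"))       # 7 * 2 = 14 GB
print(gradients_memory(params_in_billions))                     # 7 * 4 = 28 GB (fp32 gradients)
print(optimizer_memory(params_in_billions, optimizer="adamw"))  # 3 * 7 * 4 = 84 GB
```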