quick qlora support
- README.md +9 -0
- app.py +34 -145
- estimate_train_vram.py +51 -0
- vram_helpers.py +103 -0
README.md
CHANGED
@@ -10,3 +10,12 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+TODO:
+- include vocab size as manual parameter
+- include finetuning techniques (LoRA/QLoRA/LoftQ)
+- include inference (KV cache memory)
+- include number of experts for Mixture of Experts (MoE) models
+- include DeepSpeed communication memory overhead (`allgather_bucket_size`)
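One of the TODO items above is inference memory, which is dominated by the KV cache. As a rough illustration of what that estimate could look like (not part of this commit; the helper below and the Mistral-like shapes are assumptions), the cache grows linearly with layers, key/value heads, head dimension, sequence length, and batch size:

```python
# Hypothetical helper, not part of this commit: rough KV-cache size for a decoder-only model.
def kv_cache_memory_gb(num_layers, num_kv_heads, head_dim, sequence_length, batch_size, bytes_per_value=2):
    # Factor of 2 covers keys and values; 16-bit cache entries assumed by default.
    kv_bytes = 2 * num_layers * num_kv_heads * head_dim * sequence_length * batch_size * bytes_per_value
    return kv_bytes / 10**9

# Example with Mistral-7B-like shapes (32 layers, 8 KV heads, head_dim 128) at 8192 tokens, batch size 1:
print(kv_cache_memory_gb(32, 8, 128, 8192, 1))  # ~1.07 GB
```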
app.py
CHANGED
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 import argparse
 from functools import partial
 
@@ -6,14 +5,8 @@ from functools import partial
 import gradio as gr
 from transformers import AutoConfig
 
-
-PRECISION_TO_BYTES = {"float32": 4,
-                      "fp32": 4,
-                      "float16": 2,
-                      "fp16": 2,
-                      "bfloat16": 2,
-                      "bf16": 2,
-                      "int8": 1}
+from estimate_train_vram import vram_required
+from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclass
 
 ZERO_STAGES = [0, 1, 2, 3]
 BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
@@ -21,29 +14,6 @@ OPTIMIZERS = ["adam", "adamw", "sgd"]
 HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
 
 
-@dataclass
-class ModelConfig:
-    model_size: float
-    hidden_size: int
-    sequence_length: int
-    num_layers: int
-    num_heads: int
-
-    def overwrite_with_hf_config(self, config: dict):
-        self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
-        self.hidden_size = config["hidden_size"]
-        self.sequence_length = config["max_position_embeddings"]
-        self.num_layers = config["num_hidden_layers"]
-        self.num_heads = config["num_attention_heads"]
-
-@dataclass
-class TrainingConfig:
-    micro_batch_size: int
-    num_gpus: int
-    optimizer: str
-    zero_stage: int
-    gradient_checkpointing: False
-    mixed_precision: False
 
 
 def parse_args():
@@ -55,51 +25,20 @@ def parse_args():
     parser.add_argument("--sequence_length", type=int, default=8192, help="Sequence length")
     parser.add_argument("--num_layers", type=int, default=32, help="Number of layers")
     parser.add_argument("--num_heads", type=int, default=32, help="Number of heads")
+    parser.add_argument("--mixed_precision", action="store_false", help="Enable mixed precision for model training")
+    parser.add_argument("--precision", type=str, default="bf16", help="Model precision for training")
     parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size (batch size per device/GPU)")
     parser.add_argument("--zero_stage", type=int, default=0, choices=ZERO_STAGES, help="ZeRO optimization stage")
     parser.add_argument("--gradient_checkpointing", action="store_false", help="Enable gradient checkpointing")
-    parser.add_argument("--mixed_precision", action="store_false", help="Enable mixed precision for model training")
     parser.add_argument("--optimizer", type=str, default="adamw", choices=OPTIMIZERS, help="Type of optimizer")
     parser.add_argument("--num_gpus", type=int, default=4, help="Number of GPUs. Necessary for estimating ZeRO stages")
    parser.add_argument("--cache_dir", type=str, default=None, help="HuggingFace cache directory to download config from")
+    parser.add_argument("--qlora", action="store_false", help="Enable QLoRA in case of finetuning")
 
     parser.add_argument("--no-app", action="store_true", help="Launch gradio app. Otherwise, commandline output")
     return parser
 
-def get_model_size_from_config(config: dict):
-    # Embedding parameters:
-    embedding_params = config["vocab_size"] * config["hidden_size"]
-
-    # Transformer layer parameters
-    def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads):
-        input_layernorm_params = hidden_size
-        mlp_down_proj_params = hidden_size * intermediate_size
-        mlp_gate_proj_params = intermediate_size * hidden_size
-        mlp_up_proj_params = intermediate_size * hidden_size
-        post_attention_layernorm_params = hidden_size
-        self_attn_k_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
-        self_attn_o_proj_params = hidden_size * hidden_size
-        self_attn_q_proj_params = hidden_size * hidden_size
-        self_attn_v_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
-
-        total_layer_params = (
-            input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
-            post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
-            self_attn_q_proj_params + self_attn_v_proj_params
-        )
-
-        return total_layer_params
-
-    # Total parameters for all transformer layers
-    single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"])
-    total_transformer_params = config["num_hidden_layers"] * single_layer_params
-
-    # Output layer parameters
-    output_params = config["vocab_size"] * config["hidden_size"]
-
-    # Total parameters
-    total_params = embedding_params + total_transformer_params + output_params
-    return total_params
+
 
 
 def download_config_from_hub(repo_id: str, cache_dir: str):
@@ -128,66 +67,11 @@ def scrape_config_from_hub(repo_id):
 
     return config
 
-def model_memory(parameters, precision = "bf16", mixed_precision = False):
-    if mixed_precision:
-        return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
-    return parameters * PRECISION_TO_BYTES[precision]
-
-
-def gradients_memory(parameters, precision = "fp32"):
-    return parameters * PRECISION_TO_BYTES[precision]
-
-def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
-    optimizer_choices = {"adam": 3,
-                         "adamw": 2,
-                         "sgd": 1}
-    return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
-
-def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
-    # Reference: https://arxiv.org/pdf/2205.05198
-    # Activations assumed to be in 16-bit floating precision
-    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
-    bytes_model = bytes_per_layer * num_layers
-    return round(bytes_model / 10**9, 2)
-
-def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
-    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
-
-    model_vram = model_memory(model_size, mixed_precision=mixed_precision)
-    gradients_vram = gradients_memory(model_size)
-    optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
-
-    # Baseline
-    if zero_stage == 0:
-        pass
-    # Optimizer state partitioning
-    if zero_stage >= 1:
-        optimizer_vram = optimizer_vram / num_gpus
-    # Gradient + Optimzer state partitioning
-    if zero_stage >= 2:
-        gradients_vram = gradients_vram / num_gpus
-    # Parameter partitioning + Gradient + Optimizer partitioning
-    if zero_stage == 3:
-        aggregated_vram = model_vram / num_gpus
-
-    aggregated_vram = round(model_vram, 2) + gradients_vram + optimizer_vram
-
-    activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
-    if gradient_checkpointing:
-        activations_vram = round(activations_vram ** 0.5, 2)
-
-    total_vram = aggregated_vram + activations_vram
-    return {"total": total_vram, "model": model_vram, "gradients": gradients_vram, "optimizer": optimizer_vram, "activations": activations_vram}
 
 def build_interface(estimate_vram_fn):
-    training_params = []
     with gr.Blocks() as app:
-
         option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")
-
-        repo_id = gr.Textbox(label="Repo ID", visible=False)
-
-
+        repo_id = gr.Textbox(label="Repo ID", visible=False, placeholder="mistralai/Mistral-7B-v0.1")
 
         with gr.Row(visible=False) as model_params_row:
             model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
@@ -217,6 +101,7 @@ def build_interface(estimate_vram_fn):
             gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
             gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
             gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
+            gr.Dropdown(label="QLoRA", choices=[False, True], value=False, info="Finetune with QLoRA enabled"),
             gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
             gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
         ]
@@ -225,51 +110,55 @@
 
         output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
 
+        def create_combined_params_dict(repo_id, *values):
+            all_params = model_params + training_params
+            combined_dict = {param.label.lower().replace(" ", "_"): value for param, value in zip(all_params, values)}
+            combined_dict["repo_id"] = repo_id
+            return combined_dict
+
         submit_btn.click(
-            fn=estimate_vram_fn,
-            inputs=[repo_id
-
-
-
+            fn=lambda repo_id, *values: estimate_vram_fn(create_combined_params_dict(repo_id, *values)),
+            inputs=[repo_id] + model_params + training_params,
+            outputs=[output]
+        )
     return app
 
 
-def estimate_vram(
-
-
-
-
-    training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
-    if not params["repo_id"]:
+def estimate_vram(gradio_params):
+    model_config = ModelConfig(**filter_params_for_dataclass(ModelConfig, gradio_params))
+    training_config = TrainingConfig(**filter_params_for_dataclass(TrainingConfig, gradio_params))
+    # Update model config
+    if not gradio_params["repo_id"]:
        return "No model selected!"
     # If cache directory set, then download config
-    if
-        config = scrape_config_from_hub(
+    if gradio_params["cache_dir"]:
+        config = scrape_config_from_hub(gradio_params["repo_id"])
         model_config.overwrite_with_hf_config(config)
     # By default, scrape config.json from hub
     else:
-        config = download_config_from_hub(
+        config = download_config_from_hub(gradio_params["repo_id"], gradio_params["cache_dir"])
         model_config.overwrite_with_hf_config(config.to_dict())
 
-
+    if gradio_params["qlora"]:
+        model_config.precision = "int4"
+    total_vram_dict = vram_required(model_config, training_config)
     output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
     return output_str
 
 if __name__ == "__main__":
     parser = parse_args()
     args = parser.parse_args()
-
+
     # Launch gradio interface
     if not args.no_app:
         import gradio as gr
-
-        estimate_vram_fn = partial(estimate_vram, arg_keys)
+        estimate_vram_fn = partial(estimate_vram)
         interface = build_interface(estimate_vram_fn)
         interface.launch()
     # Command line interface
     else:
-        model_config = ModelConfig(
-        training_config = TrainingConfig(
+        model_config = ModelConfig(**filter_params_for_dataclass(ModelConfig, vars(args)))
+        training_config = TrainingConfig(**filter_params_for_dataclass(TrainingConfig, vars(args)))
         if args.repo_id:
             # If cache directory set, then download config
             if args.cache_dir:
@@ -279,5 +168,5 @@ if __name__ == "__main__":
                 config = scrape_config_from_hub(args.repo_id)
                 model_config.overwrite_with_hf_config(config)
 
-    total_vram_dict = vram_required(
-    print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations")
+        total_vram_dict = vram_required(model_config, training_config)
+        print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB (activations)")
estimate_train_vram.py
ADDED
@@ -0,0 +1,51 @@
+
+from vram_helpers import model_memory, gradients_memory, optimizer_memory, activations_memory
+
+
+def vram_required(model_config, training_config):
+    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+
+    trainable_parameters = model_config.model_size
+    if training_config.qlora:
+        model_config.precision = "int4"
+        # Trainable LoRA parameters are generally around 4-8% of the base model, so take 10% as an upper bound
+        trainable_parameters = 0.1 * model_config.model_size
+
+    model_vram = model_memory(parameters=trainable_parameters,
+                              precision=model_config.precision,
+                              mixed_precision=model_config.mixed_precision)
+
+    gradients_vram = gradients_memory(parameters=trainable_parameters)
+    optimizer_vram = optimizer_memory(parameters=trainable_parameters, optimizer=training_config.optimizer)
+
+    # Baseline
+    if training_config.zero_stage == 0:
+        pass
+    # Optimizer state partitioning
+    if training_config.zero_stage >= 1:
+        optimizer_vram = optimizer_vram / training_config.num_gpus
+    # Gradient + Optimizer state partitioning
+    if training_config.zero_stage >= 2:
+        gradients_vram = gradients_vram / training_config.num_gpus
+    # Parameter partitioning + Gradient + Optimizer partitioning
+    if training_config.zero_stage == 3:
+        model_vram = model_vram / training_config.num_gpus
+
+    aggregated_vram = model_vram + gradients_vram + optimizer_vram
+
+    activations_vram = activations_memory(model_config.num_layers,
+                                          model_config.sequence_length,
+                                          training_config.micro_batch_size,
+                                          model_config.hidden_size,
+                                          model_config.num_heads)
+    if training_config.gradient_checkpointing:
+        activations_vram = round(activations_vram ** 0.5, 2)
+
+    total_vram = aggregated_vram + activations_vram
+    return {k: round(v, 2) for k, v in {
+        "total": total_vram,
+        "model": model_vram,
+        "gradients": gradients_vram,
+        "optimizer": optimizer_vram,
+        "activations": activations_vram
+    }.items()}
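For reference, a minimal sketch of how the new module could be exercised on its own, assuming both added files sit on the import path; the field values mirror the defaults in app.py's parse_args, and the dataclasses come from vram_helpers.py below:

```python
from vram_helpers import ModelConfig, TrainingConfig
from estimate_train_vram import vram_required

# Roughly the parse_args defaults: a ~7B-parameter model on 4 GPUs, finetuned with QLoRA.
model_config = ModelConfig(model_size=7, hidden_size=4096, sequence_length=8192,
                           num_layers=32, num_heads=32, precision="bf16")
training_config = TrainingConfig(micro_batch_size=4, num_gpus=4, optimizer="adamw",
                                 zero_stage=0, qlora=True, gradient_checkpointing=True)

# Returns a dict of GB estimates: total, model, gradients, optimizer, activations.
print(vram_required(model_config, training_config))
```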
vram_helpers.py
ADDED
@@ -0,0 +1,103 @@
+from dataclasses import dataclass, fields
+from typing import Optional
+
+
+PRECISION_TO_BYTES = {"float32": 4,
+                      "fp32": 4,
+                      "float16": 2,
+                      "fp16": 2,
+                      "bfloat16": 2,
+                      "bf16": 2,
+                      "int8": 1,
+                      "int4": 0.5}
+
+
+@dataclass
+class ModelConfig:
+    model_size: float
+    hidden_size: int
+    sequence_length: int
+    num_layers: int
+    num_heads: int
+    mixed_precision: bool = False
+    precision: str = "bf16"
+    repo_id: Optional[str] = None
+
+    def overwrite_with_hf_config(self, config: dict):
+        self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
+        self.hidden_size = config["hidden_size"]
+        self.sequence_length = config["max_position_embeddings"]
+        self.num_layers = config["num_hidden_layers"]
+        self.num_heads = config["num_attention_heads"]
+
+@dataclass
+class TrainingConfig:
+    micro_batch_size: int
+    num_gpus: int
+    optimizer: str
+    zero_stage: int
+    qlora: bool = False
+    gradient_checkpointing: bool = False
+
+# Utility function to filter params based on dataclass fields
+def filter_params_for_dataclass(dataclass_type, params):
+    return {field.name: params[field.name] for field in fields(dataclass_type) if field.name in params}
+
+def get_model_size_from_config(config: dict):
+    # Embedding parameters:
+    embedding_params = config["vocab_size"] * config["hidden_size"]
+
+    # Transformer layer parameters
+    def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads):
+        input_layernorm_params = hidden_size
+        mlp_down_proj_params = hidden_size * intermediate_size
+        mlp_gate_proj_params = intermediate_size * hidden_size
+        mlp_up_proj_params = intermediate_size * hidden_size
+        post_attention_layernorm_params = hidden_size
+        self_attn_k_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
+        self_attn_o_proj_params = hidden_size * hidden_size
+        self_attn_q_proj_params = hidden_size * hidden_size
+        self_attn_v_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
+
+        total_layer_params = (
+            input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
+            post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
+            self_attn_q_proj_params + self_attn_v_proj_params
+        )
+
+        return total_layer_params
+
+    # Total parameters for all transformer layers
+    single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"])
+    total_transformer_params = config["num_hidden_layers"] * single_layer_params
+
+    # Output layer parameters
+    output_params = config["vocab_size"] * config["hidden_size"]
+
+    # Total parameters
+    total_params = embedding_params + total_transformer_params + output_params
+    return total_params
+
+def model_memory(parameters, precision="bf16", mixed_precision=False):
+    if mixed_precision:
+        return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
+    return parameters * PRECISION_TO_BYTES[precision]
+
+
+def gradients_memory(parameters, precision="fp32"):
+    return parameters * PRECISION_TO_BYTES[precision]
+
+def optimizer_memory(parameters, optimizer="adamw", precision="fp32"):
+    optimizer_choices = {"adam": 3,         # Adam: stores fp32 copies of the parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
+                         "adamw": 3,        # AdamW: same as Adam
+                         "sgd": 2,          # SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
+                         "adam-8bit": 1.5,  # Adam 8-bit: same as Adam -> 2 + 2 + 2 = 6 bytes per model parameter
+                         }
+    return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
+
+def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
+    # Reference: https://arxiv.org/pdf/2205.05198
+    # Activations assumed to be in 16-bit floating precision
+    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
+    bytes_model = bytes_per_layer * num_layers
+    return bytes_model / 10**9