Spaces:
Running
Running
Fix mismatch between default values and choice lists of Gradio dropdowns
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ PRECISION_TO_BYTES = {"float32": 4,
|
|
16 |
"int8": 1}
|
17 |
|
18 |
ZERO_STAGES = [0, 1, 2, 3]
|
|
|
19 |
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
20 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
21 |
|
@@ -151,33 +152,32 @@ def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_siz
|
|
151 |
|
152 |
def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
|
153 |
# Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
|
|
|
154 |
model_vram = model_memory(model_size, mixed_precision=mixed_precision)
|
155 |
gradients_vram = gradients_memory(model_size)
|
156 |
optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
|
157 |
|
158 |
# Baseline
|
159 |
if zero_stage == 0:
|
160 |
-
|
161 |
# Optimizer state partitioning
|
162 |
-
if zero_stage
|
163 |
-
|
164 |
# Gradient + Optimizer state partitioning
|
165 |
-
if zero_stage
|
166 |
-
|
167 |
# Parameter partitioning + Gradient + Optimizer partitioning
|
168 |
if zero_stage == 3:
|
169 |
-
aggregated_vram =
|
170 |
|
171 |
-
|
172 |
|
173 |
activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
|
174 |
if gradient_checkpointing:
|
175 |
-
activations_vram = activations_vram ** 0.5
|
176 |
|
177 |
-
print(f"Activations require {activations_vram} GB with gradient checkpointing: {gradient_checkpointing}")
|
178 |
total_vram = aggregated_vram + activations_vram
|
179 |
-
|
180 |
-
return total_vram
|
181 |
|
182 |
def build_interface(estimate_vram_fn):
|
183 |
training_params = []
|
@@ -190,11 +190,11 @@ def build_interface(estimate_vram_fn):
|
|
190 |
|
191 |
|
192 |
with gr.Row(visible=False) as model_params_row:
|
193 |
-
model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=
|
194 |
gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
|
195 |
-
gr.Slider(label="Sequence length", minimum=256, maximum=
|
196 |
-
gr.Slider(label="Num layers", minimum=
|
197 |
-
gr.Slider(label="Num heads", minimum=
|
198 |
]
|
199 |
|
200 |
|
@@ -212,16 +212,17 @@ def build_interface(estimate_vram_fn):
|
|
212 |
|
213 |
|
214 |
with gr.Row(equal_height=True):
|
215 |
-
training_params = [gr.Dropdown(label="Micro batch size", choices=
|
216 |
gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
|
217 |
-
gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=
|
218 |
-
gr.Dropdown(label="Mixed
|
219 |
gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
|
220 |
gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
|
221 |
gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
|
222 |
]
|
223 |
|
224 |
submit_btn = gr.Button("Estimate!")
|
|
|
225 |
output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
|
226 |
|
227 |
submit_btn.click(
|
@@ -235,22 +236,24 @@ def build_interface(estimate_vram_fn):
|
|
235 |
|
236 |
def estimate_vram(arg_keys, *args):
|
237 |
params = dict(zip(arg_keys, args))
|
238 |
-
print(params)
|
239 |
|
240 |
model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
|
241 |
training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
|
242 |
-
if params["repo_id"]:
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
|
|
251 |
|
252 |
-
|
253 |
-
|
|
|
254 |
|
255 |
if __name__ == "__main__":
|
256 |
parser = parse_args()
|
@@ -276,4 +279,5 @@ if __name__ == "__main__":
|
|
276 |
config = scrape_config_from_hub(args.repo_id)
|
277 |
model_config.overwrite_with_hf_config(config)
|
278 |
|
279 |
-
|
|
|
|
16 |
"int8": 1}
|
17 |
|
18 |
ZERO_STAGES = [0, 1, 2, 3]
|
19 |
+
BATCH_SIZES = [1,2,4,8,16,32,64]
|
20 |
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
21 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
22 |
|
|
|
152 |
|
153 |
def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
    """Estimate the per-device memory needed to train a model.

    The estimate is the sum of model weights, gradients, optimizer state and
    activations. ZeRO stages are cumulative: each stage divides one more of
    the first three components by ``num_gpus``.

    Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

    Args:
        model_size: Model size, forwarded to the memory helpers.
        hidden_size: Hidden size, forwarded to ``activations_memory``.
        sequence_length: Sequence length, forwarded to ``activations_memory``.
        num_layers: Number of layers, forwarded to ``activations_memory``.
        num_heads: Number of heads, forwarded to ``activations_memory``.
        micro_batch_size: Batch size per device, forwarded to
            ``activations_memory``.
        num_gpus: Number of devices the ZeRO-partitioned components are
            divided across.
        optimizer: Optimizer name, forwarded to ``optimizer_memory``.
        zero_stage: ZeRO stage; 0 disables partitioning, 1 partitions the
            optimizer state, 2 also the gradients, 3 also the parameters.
        gradient_checkpointing: When truthy, the activation estimate is
            replaced by its square root (rounded to 2 decimals).
        mixed_precision: Forwarded to ``model_memory``.

    Returns:
        dict with keys ``"total"``, ``"model"``, ``"gradients"``,
        ``"optimizer"`` and ``"activations"``. The component values are the
        per-device figures after partitioning (presumably in GB, since the
        callers print them with a "GB" suffix -- confirm against the helpers).
    """
    model_vram = model_memory(model_size, mixed_precision=mixed_precision)
    gradients_vram = gradients_memory(model_size)
    optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)

    # Baseline (stage 0): nothing is partitioned.
    # Stage 1: optimizer state partitioning.
    if zero_stage >= 1:
        optimizer_vram = optimizer_vram / num_gpus
    # Stage 2: gradient + optimizer state partitioning.
    if zero_stage >= 2:
        gradients_vram = gradients_vram / num_gpus
    # Stage 3: parameter + gradient + optimizer state partitioning.
    # The partitioned value must be stored back into model_vram; writing it to
    # aggregated_vram would be discarded by the aggregation just below.
    if zero_stage == 3:
        model_vram = model_vram / num_gpus

    aggregated_vram = round(model_vram, 2) + gradients_vram + optimizer_vram

    activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
    if gradient_checkpointing:
        # NOTE(review): square-root scaling looks like the usual sqrt(n)
        # checkpointing heuristic applied to the total -- confirm intent.
        activations_vram = round(activations_vram ** 0.5, 2)

    total_vram = aggregated_vram + activations_vram
    return {"total": total_vram, "model": model_vram, "gradients": gradients_vram, "optimizer": optimizer_vram, "activations": activations_vram}
|
|
|
181 |
|
182 |
def build_interface(estimate_vram_fn):
|
183 |
training_params = []
|
|
|
190 |
|
191 |
|
192 |
with gr.Row(visible=False) as model_params_row:
|
193 |
+
model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
|
194 |
gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
|
195 |
+
gr.Slider(label="Sequence length", minimum=256, maximum=128_000, step=256, value=8192, info="Sequence length"),
|
196 |
+
gr.Slider(label="Num layers", minimum=8, maximum=64, step=1, value=32, info="Number of layers"),
|
197 |
+
gr.Slider(label="Num heads", minimum=8, maximum=64, step=1, value=32, info="Number of attention heads")
|
198 |
]
|
199 |
|
200 |
|
|
|
212 |
|
213 |
|
214 |
with gr.Row(equal_height=True):
|
215 |
+
training_params = [gr.Dropdown(label="Micro batch size", choices=BATCH_SIZES, value=4, info="Micro batch size (batch size per device/GPU)"),
|
216 |
gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
|
217 |
+
gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
|
218 |
+
gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
|
219 |
gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
|
220 |
gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
|
221 |
gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
|
222 |
]
|
223 |
|
224 |
submit_btn = gr.Button("Estimate!")
|
225 |
+
|
226 |
output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
|
227 |
|
228 |
submit_btn.click(
|
|
|
236 |
|
237 |
def estimate_vram(arg_keys, *args):
    """Build the VRAM estimate string for one submission of the interface.

    Args:
        arg_keys: Names of the submitted values, in the same order as ``args``.
        *args: The submitted values; zipped with ``arg_keys`` into a dict.

    Returns:
        ``"No model selected!"`` when ``repo_id`` is empty; otherwise a
        one-line breakdown of the total into model, gradients, optimizer and
        activations, as returned by ``vram_required``.
    """
    params = dict(zip(arg_keys, args))
    print("Parameters: ", params)

    model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
    training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
    if not params["repo_id"]:
        return "No model selected!"

    if params["cache_dir"]:
        # A cache directory was given: download the config into it. The
        # downloaded object is converted with to_dict() before it is applied.
        config = download_config_from_hub(params["repo_id"], params["cache_dir"])
        model_config.overwrite_with_hf_config(config.to_dict())
    else:
        # By default, scrape config.json from the hub; the scraped result is
        # applied as-is.
        config = scrape_config_from_hub(params["repo_id"])
        model_config.overwrite_with_hf_config(config)

    total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
    output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
    return output_str
|
257 |
|
258 |
if __name__ == "__main__":
|
259 |
parser = parse_args()
|
|
|
279 |
config = scrape_config_from_hub(args.repo_id)
|
280 |
model_config.overwrite_with_hf_config(config)
|
281 |
|
282 |
+
total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
|
283 |
+
print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations")
|