tvosch committed on
Commit
a827007
1 Parent(s): 1089a77

initial app commit

Files changed (1)
app.py +309 -0
app.py ADDED
@@ -0,0 +1,309 @@
+ from dataclasses import dataclass
+ import argparse
+ from functools import partial
+
+ import gradio as gr
+ from transformers import AutoConfig
+
+ # Bytes per parameter for common precisions; with parameter counts expressed
+ # in billions, multiplying by these values yields sizes in GB directly.
+ PRECISION_TO_BYTES = {"float32": 4,
+                       "fp32": 4,
+                       "float16": 2,
+                       "fp16": 2,
+                       "bfloat16": 2,
+                       "bf16": 2,
+                       "int8": 1}
+
+ ZERO_STAGES = [0, 1, 2, 3]
+ OPTIMIZERS = ["adam", "adamw", "sgd"]
+ HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
+
+
+ @dataclass
+ class ModelConfig:
+     model_size: float  # in billions of parameters
+     hidden_size: int
+     sequence_length: int
+     num_layers: int
+     num_heads: int
+
+     def overwrite_with_hf_config(self, config: dict):
+         self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
+         self.hidden_size = config["hidden_size"]
+         self.sequence_length = config["max_position_embeddings"]
+         self.num_layers = config["num_hidden_layers"]
+         self.num_heads = config["num_attention_heads"]
+
+
+ @dataclass
+ class TrainingConfig:
+     micro_batch_size: int
+     num_gpus: int
+     optimizer: str
+     zero_stage: int
+     gradient_checkpointing: bool
+     mixed_precision: bool
+
+
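+ # Example configs (illustrative values only, not taken from any specific model):
+ # model_config = ModelConfig(model_size=7, hidden_size=4096, sequence_length=8192,
+ #                            num_layers=32, num_heads=32)
+ # training_config = TrainingConfig(micro_batch_size=4, num_gpus=4, optimizer="adamw",
+ #                                  zero_stage=2, gradient_checkpointing=False,
+ #                                  mixed_precision=False)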
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
+
+     parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
+     parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
+     parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
+     parser.add_argument("--sequence_length", type=int, default=8192, help="Sequence length")
+     parser.add_argument("--num_layers", type=int, default=32, help="Number of layers")
+     parser.add_argument("--num_heads", type=int, default=32, help="Number of heads")
+     parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size (batch size per device/GPU)")
+     parser.add_argument("--zero_stage", type=int, default=0, choices=ZERO_STAGES, help="ZeRO optimization stage")
+     parser.add_argument("--gradient_checkpointing", action="store_true", help="Enable gradient checkpointing")
+     parser.add_argument("--mixed_precision", action="store_true", help="Enable mixed precision for model training")
+     parser.add_argument("--optimizer", type=str, default="adamw", choices=OPTIMIZERS, help="Type of optimizer")
+     parser.add_argument("--num_gpus", type=int, default=4, help="Number of GPUs. Necessary for estimating ZeRO stages")
+     parser.add_argument("--cache_dir", type=str, default=None, help="HuggingFace cache directory to download config from")
+
+     parser.add_argument("--no-app", action="store_true", help="Skip the Gradio app and print the estimate on the command line instead")
+     return parser
+
+ def get_model_size_from_config(config: dict):
+     # Embedding parameters
+     embedding_params = config["vocab_size"] * config["hidden_size"]
+
+     # Transformer layer parameters (Llama-style decoder layer)
+     def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads, num_attention_heads):
+         head_dim = hidden_size // num_attention_heads
+         input_layernorm_params = hidden_size
+         mlp_down_proj_params = hidden_size * intermediate_size
+         mlp_gate_proj_params = intermediate_size * hidden_size
+         mlp_up_proj_params = intermediate_size * hidden_size
+         post_attention_layernorm_params = hidden_size
+         # K/V projections map hidden_size to num_key_value_heads * head_dim,
+         # which is smaller than hidden_size under grouped-query attention
+         self_attn_k_proj_params = (num_key_value_heads * head_dim) * hidden_size
+         self_attn_o_proj_params = hidden_size * hidden_size
+         self_attn_q_proj_params = hidden_size * hidden_size
+         self_attn_v_proj_params = (num_key_value_heads * head_dim) * hidden_size
+
+         total_layer_params = (
+             input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
+             post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
+             self_attn_q_proj_params + self_attn_v_proj_params
+         )
+
+         return total_layer_params
+
+     # Total parameters for all transformer layers
+     single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"], config["num_attention_heads"])
+     total_transformer_params = config["num_hidden_layers"] * single_layer_params
+
+     # Output layer parameters
+     output_params = config["vocab_size"] * config["hidden_size"]
+
+     # Total parameters
+     total_params = embedding_params + total_transformer_params + output_params
+     return total_params
+
+
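+ # Sanity check (using the public Llama-2-7B config as an assumed example:
+ # vocab_size=32000, hidden_size=4096, intermediate_size=11008,
+ # num_hidden_layers=32, num_attention_heads=num_key_value_heads=32):
+ #   2 * 32000*4096 (embeddings + output)
+ #   + 32 * (2*4096 + 3*4096*11008 + 4*4096*4096)
+ #   = 6,738,411,520 parameters, i.e. ~6.74B.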
+ def download_config_from_hub(repo_id: str, cache_dir: str):
+     return AutoConfig.from_pretrained(pretrained_model_name_or_path=repo_id, cache_dir=cache_dir)
+
+
+ def scrape_config_from_hub(repo_id):
+     import requests
+     url = HUGGINGFACE_URL_CONFIG.format(repo_id)
+     config = None
+     try:
+         print(f"Fetching config.json from the following URL: {url}...")
+         response = requests.get(url)
+         response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx
+
+         config = response.json()
+         print(f"Fetched the config for model {repo_id} successfully!")
+     except requests.exceptions.HTTPError as errh:
+         print(f"HTTP Error: {errh}")
+     except requests.exceptions.ConnectionError as errc:
+         print(f"Error Connecting: {errc}")
+     except requests.exceptions.Timeout as errt:
+         print(f"Timeout Error: {errt}")
+     except requests.exceptions.RequestException as err:
+         print(f"Something went wrong: {err}")
+     except ValueError as e:
+         print(f"Error decoding JSON: {e}")
+
+     return config  # None if the request failed
+
+ # The memory helpers below take the parameter count in billions, so the
+ # returned values are already in GB (bytes per parameter * billions of params).
+ def model_memory(parameters, precision="bf16", mixed_precision=False):
+     if mixed_precision:
+         # fp32 master weights plus a 16-bit working copy
+         return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
+     return parameters * PRECISION_TO_BYTES[precision]
+
+
+ def gradients_memory(parameters, precision="fp32"):
+     return parameters * PRECISION_TO_BYTES[precision]
+
+
+ def optimizer_memory(parameters, optimizer="adamw", precision="fp32"):
+     # Number of optimizer states kept per parameter
+     optimizer_choices = {"adam": 3,
+                          "adamw": 2,
+                          "sgd": 1}
+     return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
+
+
+ def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
+     # Reference: https://arxiv.org/pdf/2205.05198
+     # Activations assumed to be in 16-bit floating precision
+     bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
+     bytes_model = bytes_per_layer * num_layers
+     return round(bytes_model / 10**9, 2)
+
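+ # Worked example with this file's defaults (7B parameters, bf16 weights,
+ # fp32 gradients and AdamW states; illustrative numbers, not measurements):
+ #   model_memory(7)     -> 7 * 2     = 14 GB
+ #   gradients_memory(7) -> 7 * 4     = 28 GB
+ #   optimizer_memory(7) -> 2 * 7 * 4 = 56 GB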
+ def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
+     # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
+     model_vram = model_memory(model_size, mixed_precision=mixed_precision)
+     gradients_vram = gradients_memory(model_size)
+     optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
+
+     # Baseline
+     if zero_stage == 0:
+         aggregated_vram = model_vram + gradients_vram + optimizer_vram
+     # Optimizer state partitioning
+     elif zero_stage == 1:
+         aggregated_vram = model_vram + gradients_vram + (optimizer_vram / num_gpus)
+     # Gradient + Optimizer state partitioning
+     elif zero_stage == 2:
+         aggregated_vram = model_vram + ((gradients_vram + optimizer_vram) / num_gpus)
+     # Parameter + Gradient + Optimizer state partitioning
+     elif zero_stage == 3:
+         aggregated_vram = (model_vram + gradients_vram + optimizer_vram) / num_gpus
+
+     print(f"ZeRO stage {zero_stage} takes {aggregated_vram} GB")
+
+     activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
+     if gradient_checkpointing:
+         # Rough square-root heuristic for the memory saved by recomputation
+         activations_vram = activations_vram ** 0.5
+
+     print(f"Activations require {activations_vram} GB with gradient checkpointing: {gradient_checkpointing}")
+     total_vram = aggregated_vram + activations_vram
+     print(f"Estimated 'minimal' VRAM requirement on {num_gpus} GPUs per GPU is {total_vram} GB")
+     return total_vram
+
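+ # Illustrative ZeRO comparison with the defaults above (7B, AdamW, 4 GPUs,
+ # no mixed precision), before activations are added:
+ #   stage 0: 14 + 28 + 56      = 98 GB per GPU
+ #   stage 1: 14 + 28 + 56/4    = 56 GB per GPU
+ #   stage 2: 14 + (28 + 56)/4  = 35 GB per GPU
+ #   stage 3: (14 + 28 + 56)/4  = 24.5 GB per GPU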
+
+ # Note: create_interface is currently unused; build_interface below drives the app.
+ def create_interface():
+     inputs = []
+     inputs.append(gr.Textbox(label="Repo ID", value=None, placeholder="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", info="HuggingFace repo id to automatically determine model settings"))
+     inputs.append(gr.Slider(label="Model Size", minimum=0.1, maximum=1000, step=0.1, value=7, info="Model size (in billion parameters)"))
+     inputs.append(gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"))
+     inputs.append(gr.Slider(label="Sequence length", minimum=256, maximum=1_000_000, step=256, value=8192, info="Sequence length"))
+     inputs.append(gr.Slider(label="Num layers", minimum=1, maximum=64, step=1, value=32, info="Number of layers"))
+     inputs.append(gr.Slider(label="Num heads", minimum=1, maximum=64, step=1, value=32, info="Number of attention heads"))
+     inputs.append(gr.Dropdown(label="Micro batch size", choices=[1, 2, 4, 8, 16, 32, 64], value=4, info="Micro batch size (batch size per device/GPU)"))
+     inputs.append(gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"))
+     inputs.append(gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=False, info="Enable gradient checkpointing"))
+     inputs.append(gr.Dropdown(label="Mixed precision", choices=[True, False], value=False, info="Enable mixed precision for model training"))
+     inputs.append(gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"))
+     inputs.append(gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"))
+     inputs.append(gr.Textbox(label="Cache dir", value=".huggingface_configs", info="HuggingFace cache directory to download config from"))
+
+     return inputs
+
+ # Note: process_inputs is currently unused as well.
+ def process_inputs(option, repo_id, sliders, text_boxes):
+     if option == "Repo ID":
+         # Process the repo_id
+         return f"Repo ID received: {repo_id}"
+     elif option == "Sliders and Textboxes":
+         # Ensure sliders and text boxes are filled
+         if len(sliders) != 5 or len(text_boxes) != 3:
+             return "Please provide exactly 5 sliders and 3 text boxes."
+         # Process sliders and text boxes
+         return f"Sliders: {sliders}\nTextboxes: {text_boxes}"
+
+ def build_interface(estimate_vram_fn):
+     with gr.Blocks() as app:
+         option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")
+
+         repo_id = gr.Textbox(label="Repo ID", visible=False)
+
+         with gr.Row(visible=False) as model_params_row:
+             model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=1000, step=0.1, value=7, info="Model size (in billion parameters)"),
+                             gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
+                             gr.Slider(label="Sequence length", minimum=256, maximum=1_000_000, step=256, value=8192, info="Sequence length"),
+                             gr.Slider(label="Num layers", minimum=1, maximum=64, step=1, value=32, info="Number of layers"),
+                             gr.Slider(label="Num heads", minimum=1, maximum=64, step=1, value=32, info="Number of attention heads")
+                             ]
+
+         def update_visibility(selected_option):
+             if selected_option == "Repo ID":
+                 return gr.update(visible=True), gr.update(visible=False)
+             elif selected_option == "Model Parameters":
+                 return gr.update(visible=False), gr.update(visible=True)
+
+         option.change(
+             fn=update_visibility,
+             inputs=[option],
+             outputs=[repo_id, model_params_row]
+         )
+
+         with gr.Row(equal_height=True):
+             training_params = [gr.Dropdown(label="Micro batch size", choices=[1, 2, 4, 8, 16, 32, 64], value=4, info="Micro batch size (batch size per device/GPU)"),
+                                gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
+                                gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=False, info="Enable gradient checkpointing"),
+                                gr.Dropdown(label="Mixed precision", choices=[True, False], value=False, info="Enable mixed precision for model training"),
+                                gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
+                                gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
+                                gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
+                                ]
+
+         submit_btn = gr.Button("Estimate!")
+         output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
+
+         submit_btn.click(
+             fn=estimate_vram_fn,
+             inputs=[repo_id, *model_params, *training_params],
+             outputs=[output]
+         )
+
+     return app
+
+
+ def estimate_vram(arg_keys, *args):
+     params = dict(zip(arg_keys, args))
+     print(params)
+
+     model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
+     training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
+     # When a repo id is given, the HuggingFace config overrides the manual model settings
+     if params["repo_id"]:
+         # If cache directory set, then download config
+         if params["cache_dir"]:
+             config = download_config_from_hub(params["repo_id"], params["cache_dir"])
+             model_config.overwrite_with_hf_config(config.to_dict())
+         # By default, scrape config.json from hub
+         else:
+             config = scrape_config_from_hub(params["repo_id"])
+             model_config.overwrite_with_hf_config(config)
+
+     total_vram = vram_required(**vars(model_config), **vars(training_config))
+     return total_vram
+
+
+ if __name__ == "__main__":
+     parser = parse_args()
+     args = parser.parse_args()
+
+     # Launch gradio interface
+     if not args.no_app:
+         arg_keys = list(vars(args).keys())
+         estimate_vram_fn = partial(estimate_vram, arg_keys)
+         interface = build_interface(estimate_vram_fn)
+         interface.launch()
+     # Command line interface
+     else:
+         model_config = ModelConfig(args.model_size, args.hidden_size, args.sequence_length, args.num_layers, args.num_heads)
+         training_config = TrainingConfig(args.micro_batch_size, args.num_gpus, args.optimizer, args.zero_stage, args.gradient_checkpointing, args.mixed_precision)
+         if args.repo_id:
+             # If cache directory set, then download config
+             if args.cache_dir:
+                 config = download_config_from_hub(args.repo_id, args.cache_dir).to_dict()
+             # By default, scrape config.json from hub
+             else:
+                 config = scrape_config_from_hub(args.repo_id)
+             model_config.overwrite_with_hf_config(config)
+
+         total_vram = vram_required(**vars(model_config), **vars(training_config))
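+ # Example invocations (assumed usage, not part of the original commit):
+ #   python app.py                                   # launch the Gradio app
+ #   python app.py --no-app --repo_id TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ #   python app.py --no-app --model_size 7 --num_gpus 4 --zero_stage 2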