initial app commit
app.py
ADDED
@@ -0,0 +1,309 @@
from dataclasses import dataclass
import argparse
from functools import partial

import gradio as gr
from transformers import AutoConfig

PRECISION_TO_BYTES = {"float32": 4,
                      "fp32": 4,
                      "float16": 2,
                      "fp16": 2,
                      "bfloat16": 2,
                      "bf16": 2,
                      "int8": 1}

ZERO_STAGES = [0, 1, 2, 3]
OPTIMIZERS = ["adam", "adamw", "sgd"]
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"

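# For intuition on the table above: a 7B-parameter model stored in bf16 occupies
# about 7e9 * 2 bytes ≈ 14 GB of weights alone.
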
@dataclass
class ModelConfig:
    model_size: float
    hidden_size: int
    sequence_length: int
    num_layers: int
    num_heads: int

    def overwrite_with_hf_config(self, config: dict):
        self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
        self.hidden_size = config["hidden_size"]
        self.sequence_length = config["max_position_embeddings"]
        self.num_layers = config["num_hidden_layers"]
        self.num_heads = config["num_attention_heads"]


@dataclass
class TrainingConfig:
    micro_batch_size: int
    num_gpus: int
    optimizer: str
    zero_stage: int
    gradient_checkpointing: bool = False
    mixed_precision: bool = False

def parse_args():
    parser = argparse.ArgumentParser(description="Parser for VRAM estimator")

    parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
    parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
    parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
    parser.add_argument("--sequence_length", type=int, default=8192, help="Sequence length")
    parser.add_argument("--num_layers", type=int, default=32, help="Number of layers")
    parser.add_argument("--num_heads", type=int, default=32, help="Number of heads")
    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size (batch size per device/GPU)")
    parser.add_argument("--zero_stage", type=int, default=0, choices=ZERO_STAGES, help="ZeRO optimization stage")
    parser.add_argument("--gradient_checkpointing", action="store_true", help="Enable gradient checkpointing")
    parser.add_argument("--mixed_precision", action="store_true", help="Enable mixed precision for model training")
    parser.add_argument("--optimizer", type=str, default="adamw", choices=OPTIMIZERS, help="Type of optimizer")
    parser.add_argument("--num_gpus", type=int, default=4, help="Number of GPUs. Necessary for estimating ZeRO stages")
    parser.add_argument("--cache_dir", type=str, default=None, help="HuggingFace cache directory to download config from")

    parser.add_argument("--no-app", action="store_true", help="Skip the Gradio app and print the estimate on the command line")
    return parser

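# Example command-line invocation (flags as declared above; repo id and values are illustrative):
#   python app.py --no-app --repo_id mistralai/Mistral-7B-v0.1 --num_gpus 8 \
#       --zero_stage 2 --micro_batch_size 2 --gradient_checkpointing --mixed_precision
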
def get_model_size_from_config(config: dict):
    # Embedding parameters:
    embedding_params = config["vocab_size"] * config["hidden_size"]

    # Transformer layer parameters
    def transformer_layer_params(hidden_size, intermediate_size, num_key_value_heads):
        input_layernorm_params = hidden_size
        mlp_down_proj_params = hidden_size * intermediate_size
        mlp_gate_proj_params = intermediate_size * hidden_size
        mlp_up_proj_params = intermediate_size * hidden_size
        post_attention_layernorm_params = hidden_size
        self_attn_k_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size
        self_attn_o_proj_params = hidden_size * hidden_size
        self_attn_q_proj_params = hidden_size * hidden_size
        self_attn_v_proj_params = (hidden_size // (num_key_value_heads // 2)) * hidden_size

        total_layer_params = (
            input_layernorm_params + mlp_down_proj_params + mlp_gate_proj_params + mlp_up_proj_params +
            post_attention_layernorm_params + self_attn_k_proj_params + self_attn_o_proj_params +
            self_attn_q_proj_params + self_attn_v_proj_params
        )

        return total_layer_params

    # Total parameters for all transformer layers
    single_layer_params = transformer_layer_params(config["hidden_size"], config["intermediate_size"], config["num_key_value_heads"])
    total_transformer_params = config["num_hidden_layers"] * single_layer_params

    # Output layer parameters
    output_params = config["vocab_size"] * config["hidden_size"]

    # Total parameters
    total_params = embedding_params + total_transformer_params + output_params
    return total_params

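# Usage sketch (assumes a Llama-style config.json exposing the keys read above):
#   config = scrape_config_from_hub("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")
#   print(get_model_size_from_config(config))  # total parameter count
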
def download_config_from_hub(repo_id: str, cache_dir: str):
    return AutoConfig.from_pretrained(pretrained_model_name_or_path=repo_id, cache_dir=cache_dir)

def scrape_config_from_hub(repo_id):
    import requests
    url = HUGGINGFACE_URL_CONFIG.format(repo_id)
    config = None  # Stays None if the request fails below
    try:
        print(f"Fetching config.json from the following URL: {url}...")
        response = requests.get(url)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx

        config = response.json()
        print(f"Fetched the config for model {repo_id} successfully!")
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"Something went wrong: {err}")
    except ValueError as e:
        print(f"Error decoding JSON: {e}")

    return config

def model_memory(parameters, precision="bf16", mixed_precision=False):
    # Mixed precision keeps an fp32 master copy of the weights plus a half-precision working copy
    if mixed_precision:
        return parameters * (PRECISION_TO_BYTES["fp32"] + PRECISION_TO_BYTES["fp16"])
    return parameters * PRECISION_TO_BYTES[precision]


def gradients_memory(parameters, precision="fp32"):
    return parameters * PRECISION_TO_BYTES[precision]


def optimizer_memory(parameters, optimizer="adamw", precision="fp32"):
    # Multiplier = number of states the optimizer keeps per parameter
    optimizer_choices = {"adam": 3,
                         "adamw": 2,
                         "sgd": 1}
    return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]

def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
    # Reference: https://arxiv.org/pdf/2205.05198
    # Activations assumed to be in 16-bit floating precision
    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
    bytes_model = bytes_per_layer * num_layers
    return round(bytes_model / 10**9, 2)

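# The estimate above follows Korthikanti et al. (arXiv:2205.05198), which puts transformer
# activation memory at roughly s*b*h*(34 + 5*a*s/h) bytes per layer, where s = sequence
# length, b = micro batch size, h = hidden size and a = number of attention heads.
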
def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
    model_vram = model_memory(model_size, mixed_precision=mixed_precision)
    gradients_vram = gradients_memory(model_size)
    optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)

    # Baseline
    if zero_stage == 0:
        aggregated_vram = model_vram + gradients_vram + optimizer_vram
    # Optimizer state partitioning
    if zero_stage == 1:
        aggregated_vram = model_vram + gradients_vram + (optimizer_vram / num_gpus)
    # Gradient + Optimizer state partitioning
    if zero_stage == 2:
        aggregated_vram = model_vram + ((gradients_vram + optimizer_vram) / num_gpus)
    # Parameter partitioning + Gradient + Optimizer partitioning
    if zero_stage == 3:
        aggregated_vram = (model_vram / num_gpus) + (gradients_vram / num_gpus) + (optimizer_vram / num_gpus)

    print(f"ZeRO stage {zero_stage} takes {aggregated_vram} GB")

    activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
    if gradient_checkpointing:
        activations_vram = activations_vram ** 0.5

    print(f"Activations require {activations_vram} GB with gradient checkpointing: {gradient_checkpointing}")
    total_vram = aggregated_vram + activations_vram
    print(f"Estimated 'minimal' VRAM requirement on {num_gpus} GPUs per GPU is {total_vram} GB")
    return total_vram

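# Worked example with the formulas above (values illustrative): a 7B model with
# mixed precision and adamw gives model 7*6 = 42 GB, gradients 7*4 = 28 GB and
# optimizer states 2*7*4 = 56 GB. At ZeRO stage 0 that is 126 GB per GPU; at
# stage 2 on 8 GPUs it drops to 42 + (28 + 56)/8 = 52.5 GB, before activations.
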
def create_interface():
    inputs = []
    inputs.append(gr.Textbox(label="Repo ID", value=None, placeholder="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", info="HuggingFace repo id to automatically determine model settings"))
    inputs.append(gr.Slider(label="Model Size", minimum=0.1, maximum=1000, step=0.1, value=7, info="Model size (in billion parameters)"))
    inputs.append(gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"))
    inputs.append(gr.Slider(label="Sequence length", minimum=256, maximum=1_000_000, step=256, value=8192, info="Sequence length"))
    inputs.append(gr.Slider(label="Num layers", minimum=1, maximum=64, step=1, value=32, info="Number of layers"))
    inputs.append(gr.Slider(label="Num heads", minimum=1, maximum=64, step=1, value=32, info="Number of attention heads"))
    inputs.append(gr.Dropdown(label="Micro batch size", choices=[1, 2, 4, 8, 16, 32, 64], value=4, info="Micro batch size (batch size per device/GPU)"))
    inputs.append(gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"))
    inputs.append(gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=False, info="Enable gradient checkpointing"))
    inputs.append(gr.Dropdown(label="Mixed precision", choices=[True, False], value=False, info="Enable mixed precision for model training"))
    inputs.append(gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"))
    inputs.append(gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"))
    inputs.append(gr.Textbox(label="Cache dir", value=".huggingface_configs", info="HuggingFace cache directory to download config from"))

    return inputs

def process_inputs(option, repo_id, slides, text_boxes):
    if option == "Repo ID":
        # Process the repo_id
        return f"Repo ID received: {repo_id}"
    elif option == "Slides and Textboxes":
        # Ensure slides and text boxes are filled
        if len(slides) != 5 or len(text_boxes) != 3:
            return "Please provide exactly 5 slides and 3 text boxes."
        # Process slides and text boxes
        return f"Slides: {slides}\nTextboxes: {text_boxes}"

def build_interface(estimate_vram_fn):
    with gr.Blocks() as app:
        option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")

        repo_id = gr.Textbox(label="Repo ID", visible=False)

        with gr.Row(visible=False) as model_params_row:
            model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=1000, step=0.1, value=7, info="Model size (in billion parameters)"),
                            gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
                            gr.Slider(label="Sequence length", minimum=256, maximum=1_000_000, step=256, value=8192, info="Sequence length"),
                            gr.Slider(label="Num layers", minimum=1, maximum=64, step=1, value=32, info="Number of layers"),
                            gr.Slider(label="Num heads", minimum=1, maximum=64, step=1, value=32, info="Number of attention heads")
                            ]

        def update_visibility(selected_option):
            if selected_option == "Repo ID":
                return gr.update(visible=True), gr.update(visible=False)
            elif selected_option == "Model Parameters":
                return gr.update(visible=False), gr.update(visible=True)

        option.change(
            fn=update_visibility,
            inputs=[option],
            outputs=[repo_id, model_params_row]
        )

        with gr.Row(equal_height=True):
            training_params = [gr.Dropdown(label="Micro batch size", choices=[1, 2, 4, 8, 16, 32, 64], value=4, info="Micro batch size (batch size per device/GPU)"),
                               gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
                               gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=False, info="Enable gradient checkpointing"),
                               gr.Dropdown(label="Mixed precision", choices=[True, False], value=False, info="Enable mixed precision for model training"),
                               gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
                               gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
                               gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
                               ]

        submit_btn = gr.Button("Estimate!")
        output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")

        submit_btn.click(
            fn=estimate_vram_fn,
            inputs=[repo_id, *model_params, *training_params],
            outputs=[output]
        )

    return app


def estimate_vram(arg_keys, *args):
    params = dict(zip(arg_keys, args))
    print(params)

    model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
    training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
    if params["repo_id"]:
        # If cache directory set, then download config
        if params["cache_dir"]:
            config = download_config_from_hub(params["repo_id"], params["cache_dir"])
            model_config.overwrite_with_hf_config(config.to_dict())
        # By default, scrape config.json from hub
        else:
            config = scrape_config_from_hub(params["repo_id"])
            model_config.overwrite_with_hf_config(config)

    total_vram = vram_required(**vars(model_config), **vars(training_config))
    return total_vram

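# Note: estimate_vram above relies on the Gradio inputs arriving in the same order as
# the argparse declarations; zip() silently drops the trailing no_app key, which the
# app never collects.
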
if __name__ == "__main__":
    parser = parse_args()
    args = parser.parse_args()

    # Launch gradio interface
    if not args.no_app:
        arg_keys = list(vars(args).keys())
        estimate_vram_fn = partial(estimate_vram, arg_keys)
        interface = build_interface(estimate_vram_fn)
        interface.launch()
    # Command line interface
    else:
        model_config = ModelConfig(args.model_size, args.hidden_size, args.sequence_length, args.num_layers, args.num_heads)
        training_config = TrainingConfig(args.micro_batch_size, args.num_gpus, args.optimizer, args.zero_stage, args.gradient_checkpointing, args.mixed_precision)
        if args.repo_id:
            # If cache directory set, then download config
            if args.cache_dir:
                config = download_config_from_hub(args.repo_id, args.cache_dir).to_dict()
            # By default, scrape config.json from hub
            else:
                config = scrape_config_from_hub(args.repo_id)
            model_config.overwrite_with_hf_config(config)

        total_vram = vram_required(**vars(model_config), **vars(training_config))