tvosch committed
Commit ffa1281
1 Parent(s): 9223a1e

add minimal inference code

Files changed (3):
  1. app.py +79 -40
  2. estimate_train_vram.py +34 -4
  3. vram_helpers.py +25 -7
app.py CHANGED

@@ -5,11 +5,12 @@ from functools import partial
 import gradio as gr
 from transformers import AutoConfig
 
-from estimate_train_vram import vram_required
-from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclass
+from estimate_train_vram import training_vram_required, inference_vram_required
+from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclass, PRECISION_TO_BYTES
 
 ZERO_STAGES = [0, 1, 2, 3]
 BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
+QUANTIZATION = PRECISION_TO_BYTES.keys()
 OPTIMIZERS = ["adam", "adamw", "adamw_8bit", "sgd"]
 HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
 
@@ -31,6 +32,9 @@ def parse_args():
     parser.add_argument("--num_gpus", type=int, default=4, help="Number of GPUs. Necessary for estimating ZeRO stages")
     parser.add_argument("--cache_dir", type=str, default=None, help="HuggingFace cache directory to download config from")
     parser.add_argument("--qlora", action="store_false", help="Enable QLoRA in case of finetuning")
+    parser.add_argument("--quantization", type=str, choices=QUANTIZATION, help="Type of quantization. Default is fp16/bf16")
+    parser.add_argument("--train", action="store_false", help="Flag to turn off train and run inference")
+    parser.add_argument("--total_sequence_length", type=int, default=0, help="Total sequence length (prompt + output) for inference")
 
     parser.add_argument("--no-app", action="store_true", help="Launch gradio app. Otherwise, commandline output")
     return parser
@@ -67,80 +71,110 @@ def scrape_config_from_hub(repo_id):
 
 def build_interface(estimate_vram_fn):
     with gr.Blocks() as app:
-        gr.Markdown("## Select either an existing HF model from a repository or choose your own model parameters")
-        option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")
+        gr.Markdown("## 1. Select HuggingFace model from a repository or choose your own model parameters")
+        model_option = gr.Radio(["Repo ID", "Model Parameters"], label="Select Input Type")
         repo_id = gr.Textbox(label="Repo ID", visible=False, placeholder="mistralai/Mistral-7B-v0.1")
 
         with gr.Row(visible=False) as model_params_row:
             model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
                             gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
-                            gr.Slider(label="Sequence length", minimum=256, maximum=128_000, step=256, value=8192, info="Sequence length"),
+                            gr.Slider(label="Sequence length", minimum=128, maximum=128_000, step=256, value=8192, info="Sequence length"),
                             gr.Slider(label="Num layers", minimum=8, maximum=64, step=1, value=32, info="Number of layers"),
                             gr.Slider(label="Num heads", minimum=8, maximum=64, step=1, value=32, info="Number of attention heads")
                             ]
 
 
-        def update_visibility(selected_option):
-            if selected_option == "Repo ID":
-                return gr.update(visible=True), gr.update(visible=False),
-            elif selected_option == "Model Parameters":
-                return gr.update(visible=False), gr.update(visible=True)
-
-        option.change(
-            fn=update_visibility,
-            inputs=[option],
-            outputs=[repo_id, model_params_row]
-        )
+        def update_visibility_model_type(selected_option, choices):
+            """
+            Dynamically update the visibility of components based on the selected option.
+
+            :param selected_option: The currently selected option
+            :param choices: Variable number of tuples, each containing (option_value, component)
+            :return: List of gr.update() calls corresponding to each choice
+            """
+            updates = []
+            for option_value, _ in choices:
+                updates.append(gr.update(visible=(selected_option == option_value)))
+            return updates
+
+        model_option_choices = [("Repo ID", repo_id), ("Model Parameters", model_params_row)]
+        model_option.change(
+            fn=partial(update_visibility_model_type, choices=model_option_choices),
+            inputs=[model_option],
+            outputs=[repo_id, model_params_row],
+        )
 
-        gr.Markdown("## Select training parameters")
-        with gr.Row(equal_height=True):
+        gr.Markdown("## 2. Select training or inference parameters")
+        training_option = gr.Radio(["Training", "Inference"], label="Select Input Type")
+        with gr.Row(equal_height=True, visible=False) as training_params_row:
             training_params = [gr.Dropdown(label="Micro batch size", choices=BATCH_SIZES, value=4, info="Micro batch size (batch size per device/GPU)"),
                                gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
                                gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
                                gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
                                gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
                                gr.Dropdown(label="QLoRA", choices=[False, True], value=False, info="Finetune with QLoRA enabled"),
-                               gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
-                               gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config to")
+                               gr.Slider(label="Num GPUs", minimum=1, maximum=256, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
                                ]
 
+        with gr.Row(equal_height=True, visible=False) as inference_params_row:
+            inference_params = [gr.Dropdown(label="Quantization", choices=QUANTIZATION, value="fp16", info="Quantization of model"),
+                                gr.Slider(label="Num GPUs", minimum=1, maximum=256, step=1, value=1, info="Number of GPUs"),
+                                gr.Dropdown(label="Micro batch size", choices=BATCH_SIZES, value=1, info="Micro batch size (batch size per device/GPU)"),
+                                gr.Slider(label="Total sequence length", minimum=128, maximum=128_000, value=0, info="Total sequence length to run (necessary for KV cache calculation")
+                                ]
+
+        training_option_choices = [("Training", inference_params_row), ("Inference", training_params_row)]
+        training_option.change(
+            fn=partial(update_visibility_model_type, choices=training_option_choices),
+            inputs=[training_option],
+            outputs=[training_params_row, inference_params_row],
+        )
 
-        submit_btn = gr.Button("Estimate!")
-
-        output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
-
-        def create_combined_params_dict(repo_id, *values):
-            all_params = model_params + training_params
+        submit_btn = gr.Button("Estimate!")
+        output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
+        def create_combined_params_dict(repo_id, training_option, *values):
+            all_params = model_params + training_params + inference_params
             combined_dict = {param.label.lower().replace(" ", "_"): value for param, value in zip(all_params, values)}
             combined_dict["repo_id"] = repo_id
+            combined_dict["train"] = True if training_option.lower() == "training" else False # False -> inference
             return combined_dict
 
         submit_btn.click(
-            fn=lambda repo_id, *values: estimate_vram_fn(create_combined_params_dict(repo_id, *values)),
-            inputs=[repo_id] + model_params + training_params,
-            outputs=[output]
+            fn=lambda repo_id, training_option, *values: estimate_vram_fn(create_combined_params_dict(repo_id, training_option, *values)),
+            inputs=[repo_id, training_option] + model_params + training_params + inference_params,
+            outputs=[output]
         )
     return app
 
 
 def estimate_vram(gradio_params):
+    print(gradio_params)
     model_config = ModelConfig(**filter_params_for_dataclass(ModelConfig, gradio_params))
     training_config = TrainingConfig(**filter_params_for_dataclass(TrainingConfig, gradio_params))
+
+
     # Update model config
     if not gradio_params["repo_id"]:
         return "No model selected!"
     # If cache directory set, then download config
-    if gradio_params["cache_dir"]:
-        config = scrape_config_from_hub(gradio_params["repo_id"])
-        model_config.overwrite_with_hf_config(config)
+    # if gradio_params["cache_dir"]:
+    #     config = scrape_config_from_hub(gradio_params["repo_id"])
+    #     model_config.overwrite_with_hf_config(config)
+    cache_dir="cache/"
     # By default, scrape config.json from hub
-    else:
-        config = download_config_from_hub(gradio_params["repo_id"], gradio_params["cache_dir"])
-        model_config.overwrite_with_hf_config(config.to_dict())
-
-    if gradio_params["qlora"]:
-        model_config.precision = "int4"
-    total_vram_dict = vram_required(model_config, training_config)
-    output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
+    #else:
+    config = download_config_from_hub(gradio_params["repo_id"], cache_dir)# gradio_params["cache_dir"])
+    model_config.overwrite_with_hf_config(config.to_dict())
+
+    if training_config.train:
+        total_vram_dict = training_vram_required(model_config, training_config)
+        output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
+    else: # inference
+        total_vram_dict = inference_vram_required(model_config, training_config)
+        output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['kv_cache']}GB (KV cache) + {total_vram_dict['activations']}GB activations"
     return output_str
 
 if __name__ == "__main__":
@@ -166,5 +200,10 @@ if __name__ == "__main__":
     config = scrape_config_from_hub(args.repo_id)
     model_config.overwrite_with_hf_config(config)
 
-    total_vram_dict = vram_required(model_config, training_config)
-    print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB (activations)")
+    if training_config.train:
+        total_vram_dict = training_vram_required(model_config, training_config)
+        output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
+    else: # inference
+        total_vram_dict = inference_vram_required(model_config, training_config)
+        output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['kv_cache']}GB (KV cache) + {total_vram_dict['activations']}GB activations"
+    print(output_str)
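
For reference, below is a minimal, self-contained sketch of the radio-driven visibility pattern that the updated build_interface relies on (a handler shared via functools.partial that returns one gr.update per registered row). The widget labels and choices here are illustrative placeholders, not the app's actual components.

import gradio as gr
from functools import partial

def update_visibility(selected_option, choices):
    # One gr.update per (option_value, component) pair; only the row whose
    # option matches the selected radio value becomes visible.
    return [gr.update(visible=(selected_option == option_value)) for option_value, _ in choices]

with gr.Blocks() as demo:
    option = gr.Radio(["Training", "Inference"], label="Select Input Type")
    with gr.Row(visible=False) as training_row:
        gr.Dropdown(label="Optimizer", choices=["adam", "adamw", "sgd"], value="adamw")
    with gr.Row(visible=False) as inference_row:
        gr.Dropdown(label="Quantization", choices=["fp16", "int8", "int4"], value="fp16")

    choices = [("Training", training_row), ("Inference", inference_row)]
    option.change(
        fn=partial(update_visibility, choices=choices),
        inputs=[option],
        outputs=[training_row, inference_row],
    )

if __name__ == "__main__":
    demo.launch()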
estimate_train_vram.py CHANGED

@@ -1,15 +1,20 @@
 
-from vram_helpers import model_memory, gradients_memory, optimizer_memory, activations_memory
+from vram_helpers import activations_memory_per_layer, \
+    model_memory, \
+    gradients_memory, \
+    optimizer_memory, \
+    activations_memory, \
+    kv_cache_memory
 
 
-def vram_required(model_config, training_config):
+def training_vram_required(model_config, training_config):
     # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
 
     trainable_parameters = model_config.model_size
     if training_config.qlora:
         model_config.precision = "int4"
-        # Generally around 4-8% of trainable parameters so take upper bound
-        trainable_parameters = 0.1 * model_config.model_size
+        # 0.2% according to LoRA paper (https://arxiv.org/pdf/2106.09685)
+        trainable_parameters = 0.0002 * model_config.model_size
 
     model_vram = model_memory(parameters=trainable_parameters,
                               precision=model_config.precision,
@@ -38,6 +43,7 @@ def vram_required(model_config, training_config):
                                           training_config.micro_batch_size,
                                           model_config.hidden_size,
                                           model_config.num_heads)
+
     if training_config.gradient_checkpointing:
         activations_vram = round(activations_vram ** 0.5, 2)
 
@@ -48,4 +54,28 @@ def vram_required(model_config, training_config):
         "gradients": gradients_vram,
         "optimizer": optimizer_vram,
         "activations": activations_vram
+        }.items()}
+
+
+def inference_vram_required(model_config, training_config):
+    # Total inference VRAM = model size + KV cache size + activations + additional overhead
+    model_vram = model_memory(parameters=model_config.model_size,
+                              precision=model_config.precision,
+                              mixed_precision=model_config.mixed_precision)
+    kv_cache_vram = kv_cache_memory(batch_size=training_config.micro_batch_size,
+                                    total_sequence_length=model_config.total_sequence_length,
+                                    num_layers=model_config.num_layers,
+                                    num_heads=model_config.num_heads,
+                                    hidden_size=model_config.hidden_size,
+                                    precision=model_config.precision)
+    activations_vram = activations_memory_per_layer(sequence_length=model_config.sequence_length,
+                                                    micro_batch_size=training_config.micro_batch_size,
+                                                    hidden_size=model_config.hidden_size,
+                                                    num_heads=model_config.num_heads)
+    total_vram = model_vram + kv_cache_vram + activations_vram
+    return {k: round(v, 2) for k, v in {
+        "total": total_vram,
+        "model": model_vram,
+        "kv_cache": kv_cache_vram,
+        "activations": activations_vram
         }.items()}
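
As a rough illustration of how the new inference path composes its pieces, the sketch below calls the vram_helpers functions directly with hypothetical Mistral-7B-style numbers (7.24B parameters, 32 layers, hidden size 4096, 32 heads, 8192-token context). The numeric values, and the assumption that model_memory takes the parameter count in billions and returns GB (mirroring ModelConfig.model_size), are not taken from this commit.

# Hedged sketch: hypothetical Mistral-7B-style values, not read from any config.json.
from vram_helpers import model_memory, kv_cache_memory, activations_memory_per_layer

# Weights (parameter count expressed in billions, as in ModelConfig.model_size)
model_gb = model_memory(parameters=7.24, precision="fp16", mixed_precision=False)

# KV cache for a single sequence at the full assumed context length
kv_gb = kv_cache_memory(batch_size=1,
                        total_sequence_length=8192,
                        num_layers=32,
                        num_heads=32,
                        hidden_size=4096,
                        precision="fp16")

# Per-layer activations, used as the inference-time activation estimate
act_gb = activations_memory_per_layer(sequence_length=8192,
                                      micro_batch_size=1,
                                      hidden_size=4096,
                                      num_heads=32)

print(f"model={model_gb}GB kv_cache={kv_gb}GB activations={act_gb}GB total={model_gb + kv_gb + act_gb}GB")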
vram_helpers.py CHANGED

@@ -2,11 +2,8 @@ from dataclasses import dataclass, fields
 from typing import Optional
 
 
-PRECISION_TO_BYTES = {"float32": 4,
-                      "fp32": 4,
-                      "float16": 2,
+PRECISION_TO_BYTES = {"fp32": 4,
                       "fp16": 2,
-                      "bfloat16": 2,
                       "bf16": 2,
                       "int8": 1,
                       "int4": 0.5}
@@ -17,6 +14,7 @@ class ModelConfig:
     model_size: float
     hidden_size: int
     sequence_length: int
+    total_sequence_length: int # for inference = prompt + output tokens
     num_layers: int
     num_heads: int
     mixed_precision: bool = False
@@ -27,6 +25,8 @@ class ModelConfig:
         self.model_size = round(get_model_size_from_config(config) / 10**9, 2)
         self.hidden_size = config["hidden_size"]
         self.sequence_length = config["max_position_embeddings"]
+        if self.total_sequence_length == 0:
+            self.total_sequence_length = self.sequence_length
         self.num_layers = config["num_hidden_layers"]
         self.num_heads = config["num_attention_heads"]
 
@@ -38,6 +38,7 @@ class TrainingConfig:
     zero_stage: int
     qlora: bool = False
     gradient_checkpointing: bool = False
+    train: bool = True # False for inference
 
 # Utility function to filter params based on dataclass fields
 def filter_params_for_dataclass(dataclass_type, params):
@@ -88,16 +89,33 @@ def gradients_memory(parameters, precision = "fp32"):
     return parameters * PRECISION_TO_BYTES[precision]
 
 def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
-    optimizer_choices = {"adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
+    optimizer_choices = {
+                         "adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
                          "adamw": 3, # AdamW: Same for Adam
                          "sgd": 2, # For SGD: optimier parameters and gradients -> 4 + 4 = 8 bytes per model parameter
                          "adamw_8bit": 1.5, # Adam 8-bit: same for Adam-> 2 + 2 + 2 = 6 bytes per model parameter
                          }
     return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
 
+# def activations_memory_per_layer(sequence_length, micro_batch_size, hidden_size, num_heads):
+#     bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
+#     return bytes_per_layer / 10**9
+
+def activations_memory_per_layer(sequence_length, micro_batch_size, hidden_size, num_heads):
+    precision = "fp32"
+    "Returns amount of GPU VRAM (in GB) required to store intermediate activations for traditional Transformer Encoder block"
+    mem_bytes = PRECISION_TO_BYTES[precision] * sequence_length * micro_batch_size * hidden_size * (
+        16 + 2/PRECISION_TO_BYTES[precision] + 2*num_heads*sequence_length/hidden_size + num_heads*sequence_length/(PRECISION_TO_BYTES[precision]*hidden_size))
+    return round(mem_bytes / 10**9, 2)
+
 def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads):
     # Reference: https://arxiv.org/pdf/2205.05198
     # Activations assumed to be in 16-bit floating precision
-    bytes_per_layer = sequence_length * micro_batch_size * hidden_size * (34 + 5 * (num_heads * sequence_length / hidden_size))
+    bytes_per_layer = activations_memory_per_layer(sequence_length, micro_batch_size, hidden_size, num_heads)
     bytes_model = bytes_per_layer * num_layers
-    return bytes_model / 10**9
+    return bytes_model
+
+def kv_cache_memory(batch_size, total_sequence_length, num_layers, num_heads, hidden_size, precision):
+    # Total sequence length means input prompt length + completion so we assume the context size of the model as upper bound
+    kv_cache_memory = 2 * batch_size * total_sequence_length * num_layers * num_heads * hidden_size * PRECISION_TO_BYTES[precision]
+    return kv_cache_memory / 10**9
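
The trimmed PRECISION_TO_BYTES table is also what the new quantization dropdown and --quantization flag feed on. Below is a small sketch (not the repo's model_memory implementation, whose body is outside this diff) of how bytes-per-parameter translates into weight VRAM, assuming parameter counts expressed in billions, as ModelConfig.model_size is, and GB defined as 10**9 bytes to match the rest of the helpers.

from vram_helpers import PRECISION_TO_BYTES

def weights_memory_gb(parameters_billion, precision="fp16"):
    # 1e9 parameters * bytes-per-parameter / 1e9 bytes-per-GB cancels out, so
    # billions of parameters map directly to GB at 1 byte per parameter.
    return parameters_billion * PRECISION_TO_BYTES[precision]

# A 7B model: ~14 GB of weights in fp16/bf16, ~7 GB in int8, ~3.5 GB in int4
print(weights_memory_gb(7, "fp16"), weights_memory_gb(7, "int8"), weights_memory_gb(7, "int4"))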