Spaces:
Sleeping
Sleeping
add adamw-8bit
Browse files- app.py +1 -4
- vram_helpers.py +1 -1
app.py
CHANGED
@@ -10,15 +10,12 @@ from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclas
|
|
10 |
|
11 |
ZERO_STAGES = [0, 1, 2, 3]
|
12 |
BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
|
13 |
-
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
14 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
15 |
|
16 |
|
17 |
-
|
18 |
-
|
19 |
def parse_args():
|
20 |
parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
|
21 |
-
|
22 |
parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
|
23 |
parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
|
24 |
parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
|
|
|
10 |
|
11 |
ZERO_STAGES = [0, 1, 2, 3]
|
12 |
BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
|
13 |
+
OPTIMIZERS = ["adam", "adamw", "adamw_8bit", "sgd"]
|
14 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
15 |
|
16 |
|
|
|
|
|
17 |
def parse_args():
|
18 |
parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
|
|
|
19 |
parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
|
20 |
parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
|
21 |
parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
|
vram_helpers.py
CHANGED
@@ -91,7 +91,7 @@ def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
|
|
91 |
optimizer_choices = {"adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
|
92 |
"adamw": 3, # AdamW: same as Adam
|
93 |
"sgd": 2, # For SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
|
94 |
-
"
|
95 |
}
|
96 |
return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
|
97 |
|
|
|
91 |
optimizer_choices = {"adam": 3, # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
|
92 |
"adamw": 3, # AdamW: same as Adam
|
93 |
"sgd": 2, # For SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
|
94 |
+
"adamw_8bit": 1.5, # AdamW 8-bit: same as Adam -> 2 + 2 + 2 = 6 bytes per model parameter
|
95 |
}
|
96 |
return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
|
97 |
|