tvosch committed on
Commit 282794e · 1 Parent(s): a905447

add adamw-8bit

Files changed (2)
  1. app.py +1 -4
  2. vram_helpers.py +1 -1
app.py CHANGED
@@ -10,15 +10,12 @@ from vram_helpers import ModelConfig, TrainingConfig, filter_params_for_dataclas
 
 ZERO_STAGES = [0, 1, 2, 3]
 BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
-OPTIMIZERS = ["adam", "adamw", "sgd"]
+OPTIMIZERS = ["adam", "adamw", "adamw_8bit", "sgd"]
 HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
 
 
-
-
 def parse_args():
     parser = argparse.ArgumentParser(description="Parser for VRAM estimator")
-
     parser.add_argument("--repo_id", type=str, default=None, help="HuggingFace repo id to automatically determine model settings")
     parser.add_argument("--model_size", type=float, default=7, help="Model size (in billion parameters)")
     parser.add_argument("--hidden_size", type=int, default=4096, help="Hidden size")
vram_helpers.py CHANGED
@@ -91,7 +91,7 @@ def optimizer_memory(parameters, optimizer= "adamw", precision = "fp32"):
     optimizer_choices = {"adam": 3,  # Adam: stores precision copies of the optimizer parameters, momentum, and variance -> 4 + 4 + 4 = 12 bytes per model parameter
                          "adamw": 3,  # AdamW: same as Adam
                          "sgd": 2,  # SGD: optimizer parameters and gradients -> 4 + 4 = 8 bytes per model parameter
-                         "adam-8bit": 1.5,  # Adam 8-bit: same as Adam -> 2 + 2 + 2 = 6 bytes per model parameter
+                         "adamw_8bit": 1.5,  # Adam 8-bit: same as Adam -> 2 + 2 + 2 = 6 bytes per model parameter
                          }
     return optimizer_choices[optimizer] * parameters * PRECISION_TO_BYTES[precision]
 
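
For a sense of scale, here is a minimal, self-contained sketch of the arithmetic this function performs; PRECISION_TO_BYTES is re-declared with assumed values (fp32 = 4 bytes, fp16/bf16 = 2 bytes), since the real mapping lives elsewhere in vram_helpers.py:

# Standalone sketch; the multipliers mirror the table in the diff above.
PRECISION_TO_BYTES = {"fp32": 4, "fp16": 2, "bf16": 2}  # assumed byte sizes
OPTIMIZER_MULTIPLIERS = {"adam": 3, "adamw": 3, "sgd": 2, "adamw_8bit": 1.5}

def optimizer_memory(parameters, optimizer="adamw", precision="fp32"):
    # optimizer state bytes = multiplier * parameter count * bytes per value
    return OPTIMIZER_MULTIPLIERS[optimizer] * parameters * PRECISION_TO_BYTES[precision]

params_7b = 7e9
print(optimizer_memory(params_7b, "adamw") / 1e9)       # 84.0 GB of optimizer state
print(optimizer_memory(params_7b, "adamw_8bit") / 1e9)  # 42.0 GB, half of plain AdamW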