dh-mc committed on
Commit
1f80432
1 Parent(s): 69fc39f

ready eval qwen2-72b

Browse files
llama-factory/config/mac_template_qwen2_72b.yaml DELETED
@@ -1,43 +0,0 @@
1
- ### model
2
- model_name_or_path: ORG_NAME/MODEL_NAME
3
- quantization_bit: 4
4
-
5
- ### method
6
- stage: sft
7
- do_train: true
8
- finetuning_type: lora
9
- lora_target: all
10
-
11
- ### dataset
12
- dataset: alpaca_mac
13
- template: CHAT_TEMPLATE
14
- cutoff_len: 1024
15
- max_samples: 4528
16
- overwrite_cache: true
17
- preprocessing_num_workers: 16
18
-
19
- ### output
20
- output_dir: saves/MODEL_NAME
21
- logging_steps: 50
22
- save_steps: 560
23
- plot_loss: true
24
- # overwrite_output_dir: true
25
-
26
- ### train
27
- per_device_train_batch_size: 1
28
- gradient_accumulation_steps: 1
29
- learning_rate: 1.0e-4
30
- num_train_epochs: 6.0
31
- lr_scheduler_type: cosine
32
- warmup_ratio: 0.1
33
- bf16: true
34
- ddp_timeout: 180000000
35
-
36
- ### eval
37
- val_size: 0.01
38
- per_device_eval_batch_size: 1
39
- eval_strategy: steps
40
- eval_steps: 560
41
-
42
- report_to: wandb
43
- run_name: MODEL_NAME_lora_sft
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llm_toolkit/eval_epochs.py CHANGED
@@ -32,6 +32,7 @@ def evaluate_model_all_epochs(
32
  end_epoch=-1,
33
  batch_size=1,
34
  max_new_tokens=300,
 
35
  device="cuda",
36
  ):
37
  if adapter_path_base is None:
@@ -47,7 +48,9 @@ def evaluate_model_all_epochs(
47
  ]
48
 
49
  subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
50
- num_train_epochs = len(subdirs)
 
 
51
  print(f"found {num_train_epochs} checkpoints: {subdirs}")
52
 
53
  if end_epoch < 0 or end_epoch > num_train_epochs:
@@ -89,6 +92,7 @@ def evaluate_model_all_epochs(
89
  if __name__ == "__main__":
90
  model_name = os.getenv("MODEL_NAME")
91
  adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
 
92
  start_epoch = int(os.getenv("START_EPOCH", 1))
93
  end_epoch = os.getenv("END_EPOCH", -1)
94
  load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
@@ -118,7 +122,8 @@ if __name__ == "__main__":
118
  print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
119
  print(f"{start_gpu_memory} GB of memory reserved.")
120
 
121
- model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
 
122
 
123
  datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
124
  print_row_details(datasets["test"].to_pandas())
@@ -139,6 +144,7 @@ if __name__ == "__main__":
139
  adapter_path_base,
140
  datasets["test"],
141
  results_path,
 
142
  start_epoch=start_epoch,
143
  end_epoch=end_epoch,
144
  device=device,
 
32
  end_epoch=-1,
33
  batch_size=1,
34
  max_new_tokens=300,
35
+ checkpoints_per_epoch=1,
36
  device="cuda",
37
  ):
38
  if adapter_path_base is None:
 
48
  ]
49
 
50
  subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
51
+ num_train_epochs = len(subdirs) // checkpoints_per_epoch
52
+ if checkpoints_per_epoch > 1:
53
+ subdirs = subdirs[checkpoints_per_epoch - 1 :: checkpoints_per_epoch]
54
  print(f"found {num_train_epochs} checkpoints: {subdirs}")
55
 
56
  if end_epoch < 0 or end_epoch > num_train_epochs:
 
92
  if __name__ == "__main__":
93
  model_name = os.getenv("MODEL_NAME")
94
  adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
95
+ checkpoints_per_epoch = int(os.getenv("CHECKPOINTS_PER_EPOCH", 1))
96
  start_epoch = int(os.getenv("START_EPOCH", 1))
97
  end_epoch = os.getenv("END_EPOCH", -1)
98
  load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
 
122
  print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
123
  print(f"{start_gpu_memory} GB of memory reserved.")
124
 
125
+ # model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
126
+ model, tokenizer = None, None
127
 
128
  datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
129
  print_row_details(datasets["test"].to_pandas())
 
144
  adapter_path_base,
145
  datasets["test"],
146
  results_path,
147
+ checkpoints_per_epoch=checkpoints_per_epoch,
148
  start_epoch=start_epoch,
149
  end_epoch=end_epoch,
150
  device=device,
scripts/eval-4gpu.sh CHANGED
@@ -13,13 +13,19 @@ grep MemTotal /proc/meminfo
13
 
14
  #pip install torch torchvision torchaudio
15
 
16
- pip install -r requirements.txt
17
 
18
  export BATCH_SIZE=1
19
  export LOAD_IN_4BIT=true
20
 
21
- #./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
22
 
23
  # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
24
 
25
- ./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-210
 
 
 
 
 
 
 
13
 
14
  #pip install torch torchvision torchaudio
15
 
16
+ # pip install -r requirements.txt
17
 
18
  export BATCH_SIZE=1
19
  export LOAD_IN_4BIT=true
20
 
21
+ # ./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
22
 
23
  # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
24
 
25
+ export CHECKPOINTS_PER_EPOCH=4
26
+ ./scripts/eval-epochs.sh Qwen Qwen2-72B-Instruct
27
+
28
+ # export CHECKPOINTS_PER_EPOCH=1
29
+ # ./scripts/eval-epochs.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
30
+
31
+ # ./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-210