machine-translation

Build error

App Files Community

dh-mc committed on Aug 13, 2024

Commit

1f80432

1 Parent(s): 69fc39f

ready eval qwen2-72b

Browse files

Files changed (3) hide show

llama-factory/config/mac_template_qwen2_72b.yaml +0 -43
llm_toolkit/eval_epochs.py +8 -2
scripts/eval-4gpu.sh +9 -3

llama-factory/config/mac_template_qwen2_72b.yaml DELETED Viewed

@@ -1,43 +0,0 @@
-### model
-model_name_or_path: ORG_NAME/MODEL_NAME
-quantization_bit: 4
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-### dataset
-dataset: alpaca_mac
-template: CHAT_TEMPLATE
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/MODEL_NAME
-logging_steps: 50
-save_steps: 560
-plot_loss: true
-# overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 1
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-report_to: wandb
-run_name: MODEL_NAME_lora_sft

llm_toolkit/eval_epochs.py CHANGED Viewed

@@ -32,6 +32,7 @@ def evaluate_model_all_epochs(
     end_epoch=-1,
     batch_size=1,
     max_new_tokens=300,
     device="cuda",
 ):
     if adapter_path_base is None:
@@ -47,7 +48,9 @@ def evaluate_model_all_epochs(
         ]
         subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
-        num_train_epochs = len(subdirs)
         print(f"found {num_train_epochs} checkpoints: {subdirs}")
         if end_epoch < 0 or end_epoch > num_train_epochs:
@@ -89,6 +92,7 @@ def evaluate_model_all_epochs(
 if __name__ == "__main__":
     model_name = os.getenv("MODEL_NAME")
     adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
     start_epoch = int(os.getenv("START_EPOCH", 1))
     end_epoch = os.getenv("END_EPOCH", -1)
     load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
@@ -118,7 +122,8 @@ if __name__ == "__main__":
         print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
         print(f"{start_gpu_memory} GB of memory reserved.")
-    model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
     datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
     print_row_details(datasets["test"].to_pandas())
@@ -139,6 +144,7 @@ if __name__ == "__main__":
         adapter_path_base,
         datasets["test"],
         results_path,
         start_epoch=start_epoch,
         end_epoch=end_epoch,
         device=device,

     end_epoch=-1,
     batch_size=1,
     max_new_tokens=300,
+    checkpoints_per_epoch=1,
     device="cuda",
 ):
     if adapter_path_base is None:
         ]
         subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
+        num_train_epochs = len(subdirs) // checkpoints_per_epoch
+        if checkpoints_per_epoch > 1:
+            subdirs = subdirs[checkpoints_per_epoch - 1 :: checkpoints_per_epoch]
         print(f"found {num_train_epochs} checkpoints: {subdirs}")
         if end_epoch < 0 or end_epoch > num_train_epochs:
 if __name__ == "__main__":
     model_name = os.getenv("MODEL_NAME")
     adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
+    checkpoints_per_epoch = int(os.getenv("CHECKPOINTS_PER_EPOCH", 1))
     start_epoch = int(os.getenv("START_EPOCH", 1))
     end_epoch = os.getenv("END_EPOCH", -1)
     load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
         print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
         print(f"{start_gpu_memory} GB of memory reserved.")
+    # model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
+    model, tokenizer = None, None
     datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
     print_row_details(datasets["test"].to_pandas())
         adapter_path_base,
         datasets["test"],
         results_path,
+        checkpoints_per_epoch=checkpoints_per_epoch,
         start_epoch=start_epoch,
         end_epoch=end_epoch,
         device=device,

scripts/eval-4gpu.sh CHANGED Viewed

@@ -13,13 +13,19 @@ grep MemTotal /proc/meminfo
 #pip install torch torchvision torchaudio
-pip install -r requirements.txt
 export BATCH_SIZE=1
 export LOAD_IN_4BIT=true
-#./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
 # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
-./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-210

 #pip install torch torchvision torchaudio
+# pip install -r requirements.txt
 export BATCH_SIZE=1
 export LOAD_IN_4BIT=true
+# ./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
 # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
+export CHECKPOINTS_PER_EPOCH=4
+./scripts/eval-epochs.sh Qwen Qwen2-72B-Instruct
+# export CHECKPOINTS_PER_EPOCH=1
+# ./scripts/eval-epochs.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
+# ./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-210