Commit: ready eval qwen2-72b
llama-factory/config/mac_template_qwen2_72b.yaml (DELETED)

@@ -1,43 +0,0 @@
-### model
-model_name_or_path: ORG_NAME/MODEL_NAME
-quantization_bit: 4
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: alpaca_mac
-template: CHAT_TEMPLATE
-cutoff_len: 1024
-max_samples: 4528
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/MODEL_NAME
-logging_steps: 50
-save_steps: 560
-plot_loss: true
-# overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 1
-learning_rate: 1.0e-4
-num_train_epochs: 6.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.01
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 560
-
-report_to: wandb
-run_name: MODEL_NAME_lora_sft
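For context on the changes below: save_steps: 560 writes checkpoints on a fixed optimizer-step cadence rather than at epoch boundaries, so how many checkpoints land inside one epoch depends on the effective batch size and the data-parallel world size. A rough sketch of that arithmetic, using the values from the deleted config; the world-size value is an assumption for illustration, not something the config pins down:

    # Checkpoint-cadence arithmetic for the deleted config (illustrative only).
    # Assumption: 2 data-parallel workers; the config does not specify a world size.
    max_samples = 4528
    val_size = 0.01
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 1
    world_size = 2  # assumed for illustration

    train_samples = int(max_samples * (1 - val_size))  # 4482
    effective_batch = per_device_train_batch_size * gradient_accumulation_steps * world_size
    steps_per_epoch = train_samples // effective_batch  # 2241
    save_steps = 560
    print(steps_per_epoch // save_steps)  # 4 checkpoints per epoch under these assumptions

Under these assumptions the trainer produces four checkpoints per epoch, which matches the CHECKPOINTS_PER_EPOCH=4 setting introduced in scripts/eval-4gpu.sh below.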
llm_toolkit/eval_epochs.py (CHANGED)

@@ -32,6 +32,7 @@ def evaluate_model_all_epochs(
     end_epoch=-1,
     batch_size=1,
     max_new_tokens=300,
+    checkpoints_per_epoch=1,
     device="cuda",
 ):
     if adapter_path_base is None:
@@ -47,7 +48,9 @@ def evaluate_model_all_epochs(
     ]
 
     subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
-    num_train_epochs = len(subdirs)
+    num_train_epochs = len(subdirs) // checkpoints_per_epoch
+    if checkpoints_per_epoch > 1:
+        subdirs = subdirs[checkpoints_per_epoch - 1 :: checkpoints_per_epoch]
     print(f"found {num_train_epochs} checkpoints: {subdirs}")
 
     if end_epoch < 0 or end_epoch > num_train_epochs:
@@ -89,6 +92,7 @@ def evaluate_model_all_epochs(
 if __name__ == "__main__":
     model_name = os.getenv("MODEL_NAME")
     adapter_path_base = os.getenv("ADAPTER_PATH_BASE")
+    checkpoints_per_epoch = int(os.getenv("CHECKPOINTS_PER_EPOCH", 1))
     start_epoch = int(os.getenv("START_EPOCH", 1))
     end_epoch = os.getenv("END_EPOCH", -1)
     load_in_4bit = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
@@ -118,7 +122,8 @@ if __name__ == "__main__":
     print(f"(0) GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
     print(f"{start_gpu_memory} GB of memory reserved.")
 
-    model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
+    # model, tokenizer = load_model(model_name, load_in_4bit=load_in_4bit)
+    model, tokenizer = None, None
 
     datasets = load_translation_dataset(data_path, tokenizer, num_shots=0)
     print_row_details(datasets["test"].to_pandas())
@@ -139,6 +144,7 @@ if __name__ == "__main__":
         adapter_path_base,
         datasets["test"],
         results_path,
+        checkpoints_per_epoch=checkpoints_per_epoch,
         start_epoch=start_epoch,
         end_epoch=end_epoch,
         device=device,
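The slicing change is the core of this commit: when the trainer saved more than one checkpoint per epoch, keeping every checkpoints_per_epoch-th subdirectory, starting at index checkpoints_per_epoch - 1, leaves only the checkpoints at epoch boundaries. (Replacing the eager load_model call with model, tokenizer = None, None suggests each checkpoint is loaded inside evaluate_model_all_epochs instead; that part is outside the hunks shown.) A minimal standalone sketch of the filtering, with made-up checkpoint names:

    # Standalone illustration of the new checkpoint-filtering logic (hypothetical names).
    subdirs = [f"checkpoint-{step}" for step in range(560, 4481, 560)]  # 8 checkpoints
    checkpoints_per_epoch = 4

    subdirs = sorted(subdirs, key=lambda x: int(x.split("-")[-1]))
    num_train_epochs = len(subdirs) // checkpoints_per_epoch  # 8 // 4 = 2
    if checkpoints_per_epoch > 1:
        subdirs = subdirs[checkpoints_per_epoch - 1 :: checkpoints_per_epoch]

    print(num_train_epochs)  # 2
    print(subdirs)           # ['checkpoint-2240', 'checkpoint-4480']

With checkpoints_per_epoch=1 the slice is skipped and the behavior matches the old code.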
scripts/eval-4gpu.sh (CHANGED)

@@ -13,13 +13,19 @@ grep MemTotal /proc/meminfo
 
 #pip install torch torchvision torchaudio
 
-pip install -r requirements.txt
+# pip install -r requirements.txt
 
 export BATCH_SIZE=1
 export LOAD_IN_4BIT=true
 
-
+# ./scripts/eval-model.sh Qwen/Qwen2-72B-Instruct
 
 # ./scripts/eval-model.sh shenzhi-wang/Llama3.1-70B-Chinese-Chat
 
-
+export CHECKPOINTS_PER_EPOCH=4
+./scripts/eval-epochs.sh Qwen Qwen2-72B-Instruct
+
+# export CHECKPOINTS_PER_EPOCH=1
+# ./scripts/eval-epochs.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
+
+# ./scripts/eval-rpp.sh shenzhi-wang Llama3.1-70B-Chinese-Chat checkpoint-210