Spaces:
Runtime error
Runtime error
zetavg
committed on
finetune: support more params
Browse files
- llama_lora/lib/finetune.py +31 -8
- llama_lora/ui/finetune_ui.py +59 -4
- llama_lora/ui/main_page.py +18 -0
llama_lora/lib/finetune.py
CHANGED
@@ -29,10 +29,10 @@ def train(
|
|
29 |
# training hyperparams
|
30 |
micro_batch_size: int = 4,
|
31 |
gradient_accumulation_steps: int = 32,
|
32 |
-
|
33 |
learning_rate: float = 3e-4,
|
34 |
cutoff_len: int = 256,
|
35 |
-
val_set_size: int = 2000,
|
36 |
# lora hyperparams
|
37 |
lora_r: int = 8,
|
38 |
lora_alpha: int = 16,
|
@@ -46,12 +46,16 @@ def train(
|
|
46 |
group_by_length: bool = False, # faster, but produces an odd training loss curve
|
47 |
# either training checkpoint or final adapter
|
48 |
resume_from_checkpoint: str = None,
|
|
|
|
|
|
|
49 |
# logging
|
50 |
callbacks: List[Any] = []
|
51 |
):
|
52 |
if os.path.exists(output_dir):
|
53 |
if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
|
54 |
-
raise ValueError(
|
|
|
55 |
|
56 |
device_map = "auto"
|
57 |
world_size = int(os.environ.get("WORLD_SIZE", 1))
|
@@ -186,17 +190,17 @@ def train(
|
|
186 |
per_device_train_batch_size=micro_batch_size,
|
187 |
gradient_accumulation_steps=gradient_accumulation_steps,
|
188 |
warmup_steps=100,
|
189 |
-
num_train_epochs=
|
190 |
learning_rate=learning_rate,
|
191 |
fp16=True,
|
192 |
-
logging_steps=
|
193 |
optim="adamw_torch",
|
194 |
evaluation_strategy="steps" if val_set_size > 0 else "no",
|
195 |
save_strategy="steps",
|
196 |
eval_steps=200 if val_set_size > 0 else None,
|
197 |
-
save_steps=
|
198 |
output_dir=output_dir,
|
199 |
-
save_total_limit=
|
200 |
load_best_model_at_end=True if val_set_size > 0 else False,
|
201 |
ddp_find_unused_parameters=False if ddp else None,
|
202 |
group_by_length=group_by_length,
|
@@ -213,6 +217,24 @@ def train(
|
|
213 |
os.makedirs(output_dir)
|
214 |
with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
|
215 |
json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
|
217 |
model.config.use_cache = False
|
218 |
|
@@ -232,7 +254,8 @@ def train(
|
|
232 |
print(f"Model saved to {output_dir}.")
|
233 |
|
234 |
with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
|
235 |
-
trainer_log_history = "\n".join(
|
|
|
236 |
trainer_log_history_jsonl_file.write(trainer_log_history)
|
237 |
|
238 |
with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
|
|
|
29 |
# training hyperparams
|
30 |
micro_batch_size: int = 4,
|
31 |
gradient_accumulation_steps: int = 32,
|
32 |
+
num_train_epochs: int = 3,
|
33 |
learning_rate: float = 3e-4,
|
34 |
cutoff_len: int = 256,
|
35 |
+
val_set_size: int = 2000, # TODO: use percentage
|
36 |
# lora hyperparams
|
37 |
lora_r: int = 8,
|
38 |
lora_alpha: int = 16,
|
|
|
46 |
group_by_length: bool = False, # faster, but produces an odd training loss curve
|
47 |
# either training checkpoint or final adapter
|
48 |
resume_from_checkpoint: str = None,
|
49 |
+
save_steps: int = 200,
|
50 |
+
save_total_limit: int = 3,
|
51 |
+
logging_steps: int = 10,
|
52 |
# logging
|
53 |
callbacks: List[Any] = []
|
54 |
):
|
55 |
if os.path.exists(output_dir):
|
56 |
if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
|
57 |
+
raise ValueError(
|
58 |
+
f"The output directory already exists and is not empty. ({output_dir})")
|
59 |
|
60 |
device_map = "auto"
|
61 |
world_size = int(os.environ.get("WORLD_SIZE", 1))
|
|
|
190 |
per_device_train_batch_size=micro_batch_size,
|
191 |
gradient_accumulation_steps=gradient_accumulation_steps,
|
192 |
warmup_steps=100,
|
193 |
+
num_train_epochs=num_train_epochs,
|
194 |
learning_rate=learning_rate,
|
195 |
fp16=True,
|
196 |
+
logging_steps=logging_steps,
|
197 |
optim="adamw_torch",
|
198 |
evaluation_strategy="steps" if val_set_size > 0 else "no",
|
199 |
save_strategy="steps",
|
200 |
eval_steps=200 if val_set_size > 0 else None,
|
201 |
+
save_steps=save_steps,
|
202 |
output_dir=output_dir,
|
203 |
+
save_total_limit=save_total_limit,
|
204 |
load_best_model_at_end=True if val_set_size > 0 else False,
|
205 |
ddp_find_unused_parameters=False if ddp else None,
|
206 |
group_by_length=group_by_length,
|
|
|
217 |
os.makedirs(output_dir)
|
218 |
with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
|
219 |
json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
|
220 |
+
with open(os.path.join(output_dir, "finetune_params.json"), 'w') as finetune_params_json_file:
|
221 |
+
finetune_params = {
|
222 |
+
'micro_batch_size': micro_batch_size,
|
223 |
+
'gradient_accumulation_steps': gradient_accumulation_steps,
|
224 |
+
'num_train_epochs': num_train_epochs,
|
225 |
+
'learning_rate': learning_rate,
|
226 |
+
'cutoff_len': cutoff_len,
|
227 |
+
'lora_r': lora_r,
|
228 |
+
'lora_alpha': lora_alpha,
|
229 |
+
'lora_dropout': lora_dropout,
|
230 |
+
'lora_target_modules': lora_target_modules,
|
231 |
+
'train_on_inputs': train_on_inputs,
|
232 |
+
'group_by_length': group_by_length,
|
233 |
+
'save_steps': save_steps,
|
234 |
+
'save_total_limit': save_total_limit,
|
235 |
+
'logging_steps': logging_steps,
|
236 |
+
}
|
237 |
+
json.dump(finetune_params, finetune_params_json_file, indent=2)
|
238 |
|
239 |
model.config.use_cache = False
|
240 |
|
|
|
254 |
print(f"Model saved to {output_dir}.")
|
255 |
|
256 |
with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
|
257 |
+
trainer_log_history = "\n".join(
|
258 |
+
[json.dumps(line) for line in trainer.state.log_history])
|
259 |
trainer_log_history_jsonl_file.write(trainer_log_history)
|
260 |
|
261 |
with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
|
llama_lora/ui/finetune_ui.py
CHANGED
@@ -269,6 +269,9 @@ def do_train(
|
|
269 |
lora_dropout,
|
270 |
lora_target_modules,
|
271 |
model_name,
|
|
|
|
|
|
|
272 |
progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
|
273 |
):
|
274 |
try:
|
@@ -276,7 +279,8 @@ def do_train(
|
|
276 |
output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
|
277 |
if os.path.exists(output_dir):
|
278 |
if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
|
279 |
-
raise ValueError(
|
|
|
280 |
|
281 |
if not should_training_progress_track_tqdm:
|
282 |
progress(0, desc="Preparing train data...")
|
@@ -484,6 +488,9 @@ Train data (first 10):
|
|
484 |
train_on_inputs, # train_on_inputs
|
485 |
False, # group_by_length
|
486 |
None, # resume_from_checkpoint
|
|
|
|
|
|
|
487 |
training_callbacks # callbacks
|
488 |
)
|
489 |
|
@@ -500,7 +507,8 @@ Train data (first 10):
|
|
500 |
return result_message
|
501 |
|
502 |
except Exception as e:
|
503 |
-
raise gr.Error(
|
|
|
504 |
|
505 |
|
506 |
def do_abort_training():
|
@@ -661,6 +669,8 @@ def finetune_ui():
|
|
661 |
)
|
662 |
|
663 |
with gr.Row():
|
|
|
|
|
664 |
micro_batch_size_default_value = 1
|
665 |
|
666 |
if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
|
@@ -695,7 +705,7 @@ def finetune_ui():
|
|
695 |
)
|
696 |
|
697 |
evaluate_data_percentage = gr.Slider(
|
698 |
-
minimum=0, maximum=0.5, step=0.001, value=0
|
699 |
label="Evaluation Data Percentage",
|
700 |
info="The percentage of data to be used for evaluation. This percentage of data will not be used for training and will be used to assess the performance of the model during the process."
|
701 |
)
|
@@ -726,6 +736,26 @@ def finetune_ui():
|
|
726 |
info="Modules to replace with LoRA."
|
727 |
)
|
728 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
729 |
with gr.Column():
|
730 |
model_name = gr.Textbox(
|
731 |
lines=1, label="LoRA Model Name", value=random_name,
|
@@ -767,7 +797,10 @@ def finetune_ui():
|
|
767 |
lora_alpha,
|
768 |
lora_dropout,
|
769 |
lora_target_modules,
|
770 |
-
model_name
|
|
|
|
|
|
|
771 |
]),
|
772 |
outputs=train_output
|
773 |
)
|
@@ -860,6 +893,28 @@ def finetune_ui():
|
|
860 |
'Press to load a sample dataset of the current selected format into the textbox.',
|
861 |
});
|
862 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
863 |
tippy('#finetune_model_name', {
|
864 |
placement: 'bottom',
|
865 |
delay: [500, 0],
|
|
|
269 |
lora_dropout,
|
270 |
lora_target_modules,
|
271 |
model_name,
|
272 |
+
save_steps,
|
273 |
+
save_total_limit,
|
274 |
+
logging_steps,
|
275 |
progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
|
276 |
):
|
277 |
try:
|
|
|
279 |
output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
|
280 |
if os.path.exists(output_dir):
|
281 |
if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
|
282 |
+
raise ValueError(
|
283 |
+
f"The output directory already exists and is not empty. ({output_dir})")
|
284 |
|
285 |
if not should_training_progress_track_tqdm:
|
286 |
progress(0, desc="Preparing train data...")
|
|
|
488 |
train_on_inputs, # train_on_inputs
|
489 |
False, # group_by_length
|
490 |
None, # resume_from_checkpoint
|
491 |
+
save_steps, # save_steps
|
492 |
+
save_total_limit, # save_total_limit
|
493 |
+
logging_steps, # logging_steps
|
494 |
training_callbacks # callbacks
|
495 |
)
|
496 |
|
|
|
507 |
return result_message
|
508 |
|
509 |
except Exception as e:
|
510 |
+
raise gr.Error(
|
511 |
+
f"{e} (To dismiss this error, click the 'Abort' button)")
|
512 |
|
513 |
|
514 |
def do_abort_training():
|
|
|
669 |
)
|
670 |
|
671 |
with gr.Row():
|
672 |
+
# https://huggingface.co/docs/transformers/main/main_classes/trainer
|
673 |
+
|
674 |
micro_batch_size_default_value = 1
|
675 |
|
676 |
if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
|
|
|
705 |
)
|
706 |
|
707 |
evaluate_data_percentage = gr.Slider(
|
708 |
+
minimum=0, maximum=0.5, step=0.001, value=0,
|
709 |
label="Evaluation Data Percentage",
|
710 |
info="The percentage of data to be used for evaluation. This percentage of data will not be used for training and will be used to assess the performance of the model during the process."
|
711 |
)
|
|
|
736 |
info="Modules to replace with LoRA."
|
737 |
)
|
738 |
|
739 |
+
with gr.Row():
|
740 |
+
logging_steps = gr.Number(
|
741 |
+
label="Logging Steps",
|
742 |
+
precision=0,
|
743 |
+
value=10,
|
744 |
+
elem_id="finetune_logging_steps"
|
745 |
+
)
|
746 |
+
save_steps = gr.Number(
|
747 |
+
label="Steps Per Save",
|
748 |
+
precision=0,
|
749 |
+
value=500,
|
750 |
+
elem_id="finetune_save_steps"
|
751 |
+
)
|
752 |
+
save_total_limit = gr.Number(
|
753 |
+
label="Saved Checkpoints Limit",
|
754 |
+
precision=0,
|
755 |
+
value=5,
|
756 |
+
elem_id="finetune_save_total_limit"
|
757 |
+
)
|
758 |
+
|
759 |
with gr.Column():
|
760 |
model_name = gr.Textbox(
|
761 |
lines=1, label="LoRA Model Name", value=random_name,
|
|
|
797 |
lora_alpha,
|
798 |
lora_dropout,
|
799 |
lora_target_modules,
|
800 |
+
model_name,
|
801 |
+
save_steps,
|
802 |
+
save_total_limit,
|
803 |
+
logging_steps,
|
804 |
]),
|
805 |
outputs=train_output
|
806 |
)
|
|
|
893 |
'Press to load a sample dataset of the current selected format into the textbox.',
|
894 |
});
|
895 |
|
896 |
+
tippy('#finetune_save_total_limit', {
|
897 |
+
placement: 'bottom',
|
898 |
+
delay: [500, 0],
|
899 |
+
animation: 'scale-subtle',
|
900 |
+
content:
|
901 |
+
'Total amount of checkpoints to preserve. Older checkpoints will be deleted.',
|
902 |
+
});
|
903 |
+
tippy('#finetune_save_steps', {
|
904 |
+
placement: 'bottom',
|
905 |
+
delay: [500, 0],
|
906 |
+
animation: 'scale-subtle',
|
907 |
+
content:
|
908 |
+
'Number of updates steps before two checkpoint saves.',
|
909 |
+
});
|
910 |
+
tippy('#finetune_logging_steps', {
|
911 |
+
placement: 'bottom',
|
912 |
+
delay: [500, 0],
|
913 |
+
animation: 'scale-subtle',
|
914 |
+
content:
|
915 |
+
'Number of update steps between two logs.',
|
916 |
+
});
|
917 |
+
|
918 |
tippy('#finetune_model_name', {
|
919 |
placement: 'bottom',
|
920 |
delay: [500, 0],
|
llama_lora/ui/main_page.py
CHANGED
@@ -432,6 +432,24 @@ def main_page_custom_css():
|
|
432 |
flex: 2;
|
433 |
}
|
434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
@media screen and (max-width: 392px) {
|
436 |
#inference_lora_model, #finetune_template {
|
437 |
border-bottom-left-radius: 0;
|
|
|
432 |
flex: 2;
|
433 |
}
|
434 |
|
435 |
+
#finetune_save_total_limit,
|
436 |
+
#finetune_save_steps,
|
437 |
+
#finetune_logging_steps {
|
438 |
+
min-width: min(120px,100%) !important;
|
439 |
+
padding-top: 4px;
|
440 |
+
}
|
441 |
+
#finetune_save_total_limit span,
|
442 |
+
#finetune_save_steps span,
|
443 |
+
#finetune_logging_steps span {
|
444 |
+
font-size: 12px;
|
445 |
+
margin-bottom: 5px;
|
446 |
+
}
|
447 |
+
#finetune_save_total_limit input,
|
448 |
+
#finetune_save_steps input,
|
449 |
+
#finetune_logging_steps input {
|
450 |
+
padding: 4px 8px;
|
451 |
+
}
|
452 |
+
|
453 |
@media screen and (max-width: 392px) {
|
454 |
#inference_lora_model, #finetune_template {
|
455 |
border-bottom-left-radius: 0;
|