zetavg committed on
Commit
c5290ad
·
unverified ·
1 Parent(s): 3889cb7

update fine-tune resuming related stuff

Browse files
llama_lora/ui/finetune_ui.py CHANGED
@@ -316,6 +316,13 @@ def do_train(
316
  resume_from_checkpoint = os.path.join(Global.data_dir, "lora_models", continue_from_model)
317
  if continue_from_checkpoint:
318
  resume_from_checkpoint = os.path.join(resume_from_checkpoint, continue_from_checkpoint)
 
 
 
 
 
 
 
319
 
320
  output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
321
  if os.path.exists(output_dir):
 
316
  resume_from_checkpoint = os.path.join(Global.data_dir, "lora_models", continue_from_model)
317
  if continue_from_checkpoint:
318
  resume_from_checkpoint = os.path.join(resume_from_checkpoint, continue_from_checkpoint)
319
+ will_be_resume_from_checkpoint_file = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
320
+ if not os.path.exists(will_be_resume_from_checkpoint_file):
321
+ raise ValueError(f"Unable to resume from checkpoint {continue_from_model}/{continue_from_checkpoint}. Resuming is only possible from checkpoints stored locally in the data directory. Please ensure that the file '{will_be_resume_from_checkpoint_file}' exists.")
322
+ else:
323
+ will_be_resume_from_checkpoint_file = os.path.join(resume_from_checkpoint, "adapter_model.bin")
324
+ if not os.path.exists(will_be_resume_from_checkpoint_file):
325
+ raise ValueError(f"Unable to continue from model {continue_from_model}. Continuation is only possible from models stored locally in the data directory. Please ensure that the file '{will_be_resume_from_checkpoint_file}' exists.")
326
 
327
  output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
328
  if os.path.exists(output_dir):
lora_models/unhelpful-ai-v01/finetune_params.json CHANGED
@@ -1,8 +1,9 @@
1
  {
2
- "num_train_epochs": 16,
3
  "learning_rate": 0.0003,
4
  "cutoff_len": 512,
5
- "lora_r": 12,
 
6
  "lora_alpha": 32,
7
  "lora_dropout": 0.05,
8
  "lora_target_modules": [
@@ -11,9 +12,5 @@
11
  "k_proj",
12
  "o_proj"
13
  ],
14
- "train_on_inputs": false,
15
- "group_by_length": false,
16
- "save_steps": 500,
17
- "save_total_limit": 5,
18
- "logging_steps": 10
19
  }
 
1
  {
2
+ "num_train_epochs": 8,
3
  "learning_rate": 0.0003,
4
  "cutoff_len": 512,
5
+ "val_set_size": 0,
6
+ "lora_r": 16,
7
  "lora_alpha": 32,
8
  "lora_dropout": 0.05,
9
  "lora_target_modules": [
 
12
  "k_proj",
13
  "o_proj"
14
  ],
15
+ "train_on_inputs": false
 
 
 
 
16
  }