Kevin Fink committed on
Commit
ef63026
·
1 Parent(s): 6249921
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -246,16 +246,19 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
246
  except:
247
  checkpoint_dir = training_args.output_dir
248
  # If the trainer_state.json is missing, look for the previous checkpoint
249
- previous_checkpoints = sorted(os.listdir("/data/results"), key=get_checkpoint_int)
250
 
251
- if previous_checkpoints:
252
- # Load the most recent previous checkpoint
253
- last_checkpoint = previous_checkpoints[-2]
254
- print(f"Loading previous checkpoint: {last_checkpoint}")
255
- train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
256
- else:
257
- print("No previous checkpoints found. Starting training from scratch.")
258
- train_result = trainer.train()
 
 
 
259
  trainer.push_to_hub(commit_message="Training complete!")
260
  except Exception as e:
261
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
 
246
  except:
247
  checkpoint_dir = training_args.output_dir
248
  # If the trainer_state.json is missing, look for the previous checkpoint
249
+ previous_checkpoints = sorted(os.listdir("/data/results"), key=get_checkpoint_int, reverse=True)
250
 
251
+ for check in previous_checkpoints[1:]:
252
+ try:
253
+ print(f"Loading previous checkpoint: {check}")
254
+ train_result = trainer.train(resume_from_checkpoint=check)
255
+ trainer.push_to_hub(commit_message="Training complete!")
256
+ return 'DONE!'#train_result
257
+ except:
258
+ pass
259
+
260
+ print("No previous checkpoints found. Starting training from scratch.")
261
+ train_result = trainer.train()
262
  trainer.push_to_hub(commit_message="Training complete!")
263
  except Exception as e:
264
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"