Kevin Fink
committed on
Commit
·
ef63026
1
Parent(s):
6249921
dev
Browse files
app.py
CHANGED
@@ -246,16 +246,19 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
246 |
except:
|
247 |
checkpoint_dir = training_args.output_dir
|
248 |
# If the trainer_state.json is missing, look for the previous checkpoint
|
249 |
-
previous_checkpoints = sorted(os.listdir("/data/results"), key=get_checkpoint_int)
|
250 |
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
259 |
trainer.push_to_hub(commit_message="Training complete!")
|
260 |
except Exception as e:
|
261 |
return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
|
|
|
246 |
except:
|
247 |
checkpoint_dir = training_args.output_dir
|
248 |
# If the trainer_state.json is missing, look for the previous checkpoint
|
249 |
+
previous_checkpoints = sorted(os.listdir("/data/results"), key=get_checkpoint_int, reverse=True)
|
250 |
|
251 |
+
for check in previous_checkpoints[1:]:
|
252 |
+
try:
|
253 |
+
print(f"Loading previous checkpoint: {check}")
|
254 |
+
train_result = trainer.train(resume_from_checkpoint=check)
|
255 |
+
trainer.push_to_hub(commit_message="Training complete!")
|
256 |
+
return 'DONE!'#train_result
|
257 |
+
except:
|
258 |
+
pass
|
259 |
+
|
260 |
+
print("No previous checkpoints found. Starting training from scratch.")
|
261 |
+
train_result = trainer.train()
|
262 |
trainer.push_to_hub(commit_message="Training complete!")
|
263 |
except Exception as e:
|
264 |
return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
|