Kevin Fink committed on
Commit
8325fbf
·
1 Parent(s): 63431bc
Files changed (1) hide show
  1. app.py +17 -23
app.py CHANGED
@@ -68,8 +68,6 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
68
 
69
  # Set training arguments
70
  training_args = TrainingArguments(
71
- remove_unused_columns=False,
72
- torch_empty_cache_steps=100,
73
  output_dir='/data/results',
74
  eval_strategy="steps", # Change this to steps
75
  save_strategy='steps',
@@ -84,7 +82,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
84
  metric_for_best_model="loss",
85
  greater_is_better=True,
86
  logging_dir='/data/logs',
87
- logging_steps=200,
88
  #push_to_hub=True,
89
  hub_model_id=hub_id.strip(),
90
  fp16=True,
@@ -231,32 +229,28 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
231
  ##data_collator=data_collator,
232
  ##processing_class=tokenizer,
233
  #)
 
 
 
 
 
 
234
  try:
235
  train_result = trainer.train(resume_from_checkpoint=True)
236
  except:
237
  checkpoint_dir = training_args.output_dir
238
- if os.path.exists(checkpoint_dir) and os.listdir(checkpoint_dir):
239
- # Check if the trainer_state.json file exists in the specified checkpoint
240
- trainer_state_path = os.path.join(checkpoint_dir, 'trainer_state.json')
241
- if os.path.exists(trainer_state_path):
242
- train_result = trainer.train(resume_from_checkpoint=True)
243
- else:
244
- # If the trainer_state.json is missing, look for the previous checkpoint
245
- print(f"Checkpoint {checkpoint_dir} is missing 'trainer_state.json'. Looking for previous checkpoints...")
246
- previous_checkpoints = sorted(glob.glob(os.path.join(os.path.dirname(checkpoint_dir), 'checkpoint-*')), key=os.path.getmtime)
247
-
248
- if previous_checkpoints:
249
- # Load the most recent previous checkpoint
250
- last_checkpoint = previous_checkpoints[-1]
251
- print(f"Loading previous checkpoint: {last_checkpoint}")
252
- train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
253
- else:
254
- print("No previous checkpoints found. Starting training from scratch.")
255
- train_result = trainer.train()
256
  else:
257
- print("No checkpoints found. Starting training from scratch.")
258
  train_result = trainer.train()
259
-
260
  trainer.push_to_hub(commit_message="Training complete!")
261
  except Exception as e:
262
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
 
68
 
69
  # Set training arguments
70
  training_args = TrainingArguments(
 
 
71
  output_dir='/data/results',
72
  eval_strategy="steps", # Change this to steps
73
  save_strategy='steps',
 
82
  metric_for_best_model="loss",
83
  greater_is_better=True,
84
  logging_dir='/data/logs',
85
+ logging_steps=100,
86
  #push_to_hub=True,
87
  hub_model_id=hub_id.strip(),
88
  fp16=True,
 
229
  ##data_collator=data_collator,
230
  ##processing_class=tokenizer,
231
  #)
232
+ print(f'ROOTDIR: {os.listdir('/data/results')}')
233
+ for entry in os.listdir('data/results'):
234
+ try:
235
+ print(f'{entry}: {os.listdir(entry)}')
236
+ except:
237
+ pass
238
  try:
239
  train_result = trainer.train(resume_from_checkpoint=True)
240
  except:
241
  checkpoint_dir = training_args.output_dir
242
+ # If the trainer_state.json is missing, look for the previous checkpoint
243
+ print(f"Checkpoint {checkpoint_dir} is missing 'trainer_state.json'. Looking for previous checkpoints...")
244
+ previous_checkpoints = sorted(glob.glob(os.path.join(os.path.dirname(checkpoint_dir), 'checkpoint-*')), key=os.path.getmtime)
245
+
246
+ if previous_checkpoints:
247
+ # Load the most recent previous checkpoint
248
+ last_checkpoint = previous_checkpoints[-1]
249
+ print(f"Loading previous checkpoint: {last_checkpoint}")
250
+ train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
 
 
 
 
 
 
 
 
 
251
  else:
252
+ print("No previous checkpoints found. Starting training from scratch.")
253
  train_result = trainer.train()
 
254
  trainer.push_to_hub(commit_message="Training complete!")
255
  except Exception as e:
256
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"