Spaces:

soutrik
/

gradio_demo_CatDogClassifier

Runtime error

App Files Files Community

Soutrik committed on Nov 10, 2024

Commit

3749e6c

1 Parent(s): 3fa4d71

added: checks

Browse files

Files changed (4) hide show

configs/train.yaml +3 -0
ec2_runner_setup.md +21 -5
setup_aws_ci.md +0 -17
src/train_optuna_callbacks.py +15 -11

configs/train.yaml CHANGED Viewed

@@ -42,3 +42,6 @@ name: "catdog_experiment"
 # optimization metric
 optimization_metric: "val_acc"

 # optimization metric
 optimization_metric: "val_acc"
+# optuna hyperparameter optimization
+n_trials: 2

ec2_runner_setup.md CHANGED Viewed

@@ -1,6 +1,22 @@
 **Install docker and docker-compose on Ubuntu 22.04**
-Install docker
 ```bash
 sudo apt update
 sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
@@ -16,7 +32,7 @@ sudo reboot
 docker --version
 docker ps
 ```
-Install docker-compose
 ```bash
 sudo rm /usr/local/bin/docker-compose
 sudo curl -L "https://github.com/docker/compose/releases/download/v2.30.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
@@ -24,7 +40,7 @@ sudo chmod +x /usr/local/bin/docker-compose
 docker-compose --version
 ```
-** Github actions self-hosted runner
 ```bash
 mkdir actions-runner && cd actions-runner
 curl -o actions-runner-linux-x64-2.320.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.320.0/actions-runner-linux-x64-2.320.0.tar.gz
@@ -35,7 +51,7 @@ tar xzf ./actions-runner-linux-x64-2.320.0.tar.gz
 ./run.sh
 # https://github.com/soutrik71/pytorch-template-aws/settings/actions/runners/new?arch=x64&os=linux
 ```
-** Activate aws cli
 ```bash
 curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
 sudo apt install unzip
@@ -45,7 +61,7 @@ aws --version
 aws configure
 ```
-** S3 bucket operations
 ```bash
 aws s3 cp data s3://deep-bucket-s3/data --recursive
 aws s3 ls s3://deep-bucket-s3

 **Install docker and docker-compose on Ubuntu 22.04**
+__PreRequisites__:
+    * Have an AWS account with a user that has the necessary permissions
+    * Have the access key available either as environment variables or in the GitHub Actions secrets
+    * Have an EC2 runner instance running/created in the AWS account
+    * Have an S3 bucket created in the AWS account
+    * Have an AWS container registry created in the AWS account
+__Local VM setup__:
+    * Install the AWS CLI, then run `aws configure` to set up the access key, the secret key, and the right region
+        ```bash
+        curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+        unzip awscliv2.zip
+        sudo ./aws/install
+        aws configure
+        ```
+__Install docker__:
 ```bash
 sudo apt update
 sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
 docker --version
 docker ps
 ```
+__Install docker-compose__:
 ```bash
 sudo rm /usr/local/bin/docker-compose
 sudo curl -L "https://github.com/docker/compose/releases/download/v2.30.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
 docker-compose --version
 ```
+__GitHub Actions self-hosted runner__:
 ```bash
 mkdir actions-runner && cd actions-runner
 curl -o actions-runner-linux-x64-2.320.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.320.0/actions-runner-linux-x64-2.320.0.tar.gz
 ./run.sh
 # https://github.com/soutrik71/pytorch-template-aws/settings/actions/runners/new?arch=x64&os=linux
 ```
+__Activate AWS CLI__:
 ```bash
 curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
 sudo apt install unzip
 aws configure
 ```
+__S3 bucket operations__:
 ```bash
 aws s3 cp data s3://deep-bucket-s3/data --recursive
 aws s3 ls s3://deep-bucket-s3

setup_aws_ci.md DELETED Viewed

@@ -1,17 +0,0 @@
-## __PreRequisites__:
-    * Have an aws account with a user that has the necessary permissions
-    * Have the access key either on env variables or in the github actions secrets
-    * Have an ec2 runner instance running/created in the aws account
-    * Have a s3 bucket created in the aws account
-    * Have aws container registry created in the aws account
-## __Local VM setup__:
-    * Install aws configure and setup the access key and secret key and the right zone
-        ```bash
-        curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
-        unzip awscliv2.zip
-        sudo ./aws/install
-        aws configure
-        ```

src/train_optuna_callbacks.py CHANGED Viewed

@@ -168,43 +168,46 @@ def setup_trainer(cfg: DictConfig):
     setup_logger(
         Path(cfg.paths.log_dir)
-        / ("train.log" if cfg.task_name == "train" else "eval.log")
     )
     # Instantiate callbacks
     callbacks = instantiate_callbacks(cfg.callbacks)
     logger.info(f"Callbacks: {callbacks}")
     if cfg.get("train", False):
-        # Clear checkpoint directory
         clear_checkpoint_directory(cfg.paths.ckpt_dir)
-        # find the best hyperparameters using Optuna and train the model
         pruner = optuna.pruners.MedianPruner()
         study = optuna.create_study(
             direction="maximize", pruner=pruner, study_name="pytorch_lightning_optuna"
         )
         study.optimize(
             lambda trial: objective(trial, cfg, callbacks),
-            n_trials=3,
             show_progress_bar=True,
         )
-        # Log best trial results
         best_trial = study.best_trial
         logger.info(f"Best trial number: {best_trial.number}")
         logger.info(f"Best trial value (val_acc): {best_trial.value}")
-        for key, value in best_trial.params.items():
-            logger.info(f"  {key}: {value}")
-        # write the best hyperparameters to the config
-        best_hyperparams = {key: value for key, value in best_trial.params.items()}
         best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
         with open(best_hyperparams_path, "w") as f:
-            json.dump(best_hyperparams, f)
         logger.info(f"Best hyperparameters saved to {best_hyperparams_path}")
     if cfg.get("test", False):
         best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
         if best_hyperparams_path.exists():
             with open(best_hyperparams_path, "r") as f:
                 best_hyperparams = json.load(f)
@@ -212,10 +215,11 @@ def setup_trainer(cfg: DictConfig):
             logger.info(f"Loaded best hyperparameters for testing: {best_hyperparams}")
         else:
             logger.error(
-                "Best hyperparameters not found! Using default hyperparameters."
             )
             raise FileNotFoundError("Best hyperparameters not found!")
         data_module: L.LightningDataModule = hydra.utils.instantiate(cfg.data)
         model: L.LightningModule = hydra.utils.instantiate(cfg.model)
         trainer = Trainer(**cfg.trainer, logger=instantiate_loggers(cfg.logger))

     setup_logger(
         Path(cfg.paths.log_dir)
+        / ("train.log" if cfg.task_name == "train" else "test.log")
     )
     # Instantiate callbacks
     callbacks = instantiate_callbacks(cfg.callbacks)
     logger.info(f"Callbacks: {callbacks}")
+    # Training phase with Optuna
     if cfg.get("train", False):
         clear_checkpoint_directory(cfg.paths.ckpt_dir)
+        # Optuna study setup
         pruner = optuna.pruners.MedianPruner()
         study = optuna.create_study(
             direction="maximize", pruner=pruner, study_name="pytorch_lightning_optuna"
         )
         study.optimize(
             lambda trial: objective(trial, cfg, callbacks),
+            n_trials=cfg.n_trials,
             show_progress_bar=True,
         )
+        # Log best trial results and save hyperparameters
         best_trial = study.best_trial
         logger.info(f"Best trial number: {best_trial.number}")
         logger.info(f"Best trial value (val_acc): {best_trial.value}")
+        best_hyperparams = best_trial.params
+        logger.info(f"Best hyperparameters: {best_hyperparams}")
+        # Save best hyperparameters to JSON
         best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
         with open(best_hyperparams_path, "w") as f:
+            json.dump(best_hyperparams, f, indent=4)
         logger.info(f"Best hyperparameters saved to {best_hyperparams_path}")
+    # Testing phase with best hyperparameters
     if cfg.get("test", False):
         best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
+        logger.info(f"Testing with best hyperparameters from {best_hyperparams_path}")
         if best_hyperparams_path.exists():
             with open(best_hyperparams_path, "r") as f:
                 best_hyperparams = json.load(f)
             logger.info(f"Loaded best hyperparameters for testing: {best_hyperparams}")
         else:
             logger.error(
+                "Best hyperparameters not found! Ensure training has run with `train=True` and saved the hyperparameters."
             )
             raise FileNotFoundError("Best hyperparameters not found!")
+        # Initialize data module, model, and trainer for testing
         data_module: L.LightningDataModule = hydra.utils.instantiate(cfg.data)
         model: L.LightningModule = hydra.utils.instantiate(cfg.model)
         trainer = Trainer(**cfg.trainer, logger=instantiate_loggers(cfg.logger))