Spaces:
Runtime error
Runtime error
Soutrik
committed on
Commit
·
3749e6c
1
Parent(s):
3fa4d71
added: checks
Browse files- configs/train.yaml +3 -0
- ec2_runner_setup.md +21 -5
- setup_aws_ci.md +0 -17
- src/train_optuna_callbacks.py +15 -11
configs/train.yaml
CHANGED
@@ -42,3 +42,6 @@ name: "catdog_experiment"
|
|
42 |
|
43 |
# optimization metric
|
44 |
optimization_metric: "val_acc"
|
|
|
|
|
|
|
|
42 |
|
43 |
# optimization metric
|
44 |
optimization_metric: "val_acc"
|
45 |
+
|
46 |
+
# optuna hyperparameter optimization
|
47 |
+
n_trials: 2
|
ec2_runner_setup.md
CHANGED
@@ -1,6 +1,22 @@
|
|
1 |
**Install docker and docker-compose on Ubuntu 22.04**
|
|
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
```bash
|
5 |
sudo apt update
|
6 |
sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
|
@@ -16,7 +32,7 @@ sudo reboot
|
|
16 |
docker --version
|
17 |
docker ps
|
18 |
```
|
19 |
-
|
20 |
```bash
|
21 |
sudo rm /usr/local/bin/docker-compose
|
22 |
sudo curl -L "https://github.com/docker/compose/releases/download/v2.30.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
|
@@ -24,7 +40,7 @@ sudo chmod +x /usr/local/bin/docker-compose
|
|
24 |
docker-compose --version
|
25 |
```
|
26 |
|
27 |
-
|
28 |
```bash
|
29 |
mkdir actions-runner && cd actions-runner
|
30 |
curl -o actions-runner-linux-x64-2.320.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.320.0/actions-runner-linux-x64-2.320.0.tar.gz
|
@@ -35,7 +51,7 @@ tar xzf ./actions-runner-linux-x64-2.320.0.tar.gz
|
|
35 |
./run.sh
|
36 |
# https://github.com/soutrik71/pytorch-template-aws/settings/actions/runners/new?arch=x64&os=linux
|
37 |
```
|
38 |
-
|
39 |
```bash
|
40 |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
|
41 |
sudo apt install unzip
|
@@ -45,7 +61,7 @@ aws --version
|
|
45 |
aws configure
|
46 |
|
47 |
```
|
48 |
-
|
49 |
```bash
|
50 |
aws s3 cp data s3://deep-bucket-s3/data --recursive
|
51 |
aws s3 ls s3://deep-bucket-s3
|
|
|
1 |
**Install docker and docker-compose on Ubuntu 22.04**
|
2 |
+
__PreRequisites__:
|
3 |
|
4 |
+
* Have an AWS account with a user that has the necessary permissions
|
5 |
+
* Have the access key available either as environment variables or in the GitHub Actions secrets
|
6 |
+
* Have an ec2 runner instance running/created in the aws account
|
7 |
+
* Have an S3 bucket created in the AWS account
|
8 |
+
* Have an AWS container registry created in the AWS account
|
9 |
+
__Local VM setup__:
|
10 |
+
* Install the AWS CLI, then run `aws configure` to set up the access key, the secret key, and the right region
|
11 |
+
```bash
|
12 |
+
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
|
13 |
+
unzip awscliv2.zip
|
14 |
+
sudo ./aws/install
|
15 |
+
aws configure
|
16 |
+
```
|
17 |
+
|
18 |
+
|
19 |
+
__Install docker__:
|
20 |
```bash
|
21 |
sudo apt update
|
22 |
sudo apt install -y apt-transport-https ca-certificates curl software-properties-common
|
|
|
32 |
docker --version
|
33 |
docker ps
|
34 |
```
|
35 |
+
__Install docker-compose__:
|
36 |
```bash
|
37 |
sudo rm /usr/local/bin/docker-compose
|
38 |
sudo curl -L "https://github.com/docker/compose/releases/download/v2.30.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
|
|
|
40 |
docker-compose --version
|
41 |
```
|
42 |
|
43 |
+
__Github actions self-hosted runner__:
|
44 |
```bash
|
45 |
mkdir actions-runner && cd actions-runner
|
46 |
curl -o actions-runner-linux-x64-2.320.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.320.0/actions-runner-linux-x64-2.320.0.tar.gz
|
|
|
51 |
./run.sh
|
52 |
# https://github.com/soutrik71/pytorch-template-aws/settings/actions/runners/new?arch=x64&os=linux
|
53 |
```
|
54 |
+
__Activate aws cli__:
|
55 |
```bash
|
56 |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
|
57 |
sudo apt install unzip
|
|
|
61 |
aws configure
|
62 |
|
63 |
```
|
64 |
+
__S3 bucket operations__:
|
65 |
```bash
|
66 |
aws s3 cp data s3://deep-bucket-s3/data --recursive
|
67 |
aws s3 ls s3://deep-bucket-s3
|
setup_aws_ci.md
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
## __PreRequisites__:
|
2 |
-
|
3 |
-
* Have an aws account with a user that has the necessary permissions
|
4 |
-
* Have the access key either on env variables or in the github actions secrets
|
5 |
-
* Have an ec2 runner instance running/created in the aws account
|
6 |
-
* Have a s3 bucket created in the aws account
|
7 |
-
* Have aws container registry created in the aws account
|
8 |
-
|
9 |
-
## __Local VM setup__:
|
10 |
-
* Install aws configure and setup the access key and secret key and the right zone
|
11 |
-
```bash
|
12 |
-
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
|
13 |
-
unzip awscliv2.zip
|
14 |
-
sudo ./aws/install
|
15 |
-
aws configure
|
16 |
-
```
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/train_optuna_callbacks.py
CHANGED
@@ -168,43 +168,46 @@ def setup_trainer(cfg: DictConfig):
|
|
168 |
|
169 |
setup_logger(
|
170 |
Path(cfg.paths.log_dir)
|
171 |
-
/ ("train.log" if cfg.task_name == "train" else "
|
172 |
)
|
173 |
|
174 |
# Instantiate callbacks
|
175 |
callbacks = instantiate_callbacks(cfg.callbacks)
|
176 |
logger.info(f"Callbacks: {callbacks}")
|
177 |
|
|
|
178 |
if cfg.get("train", False):
|
179 |
-
# Clear checkpoint directory
|
180 |
clear_checkpoint_directory(cfg.paths.ckpt_dir)
|
181 |
-
|
|
|
182 |
pruner = optuna.pruners.MedianPruner()
|
183 |
study = optuna.create_study(
|
184 |
direction="maximize", pruner=pruner, study_name="pytorch_lightning_optuna"
|
185 |
)
|
186 |
study.optimize(
|
187 |
lambda trial: objective(trial, cfg, callbacks),
|
188 |
-
n_trials=
|
189 |
show_progress_bar=True,
|
190 |
)
|
191 |
|
192 |
-
# Log best trial results
|
193 |
best_trial = study.best_trial
|
194 |
logger.info(f"Best trial number: {best_trial.number}")
|
195 |
logger.info(f"Best trial value (val_acc): {best_trial.value}")
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
#
|
200 |
-
best_hyperparams = {key: value for key, value in best_trial.params.items()}
|
201 |
best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
|
202 |
with open(best_hyperparams_path, "w") as f:
|
203 |
-
json.dump(best_hyperparams, f)
|
204 |
logger.info(f"Best hyperparameters saved to {best_hyperparams_path}")
|
205 |
|
|
|
206 |
if cfg.get("test", False):
|
207 |
best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
|
|
|
|
|
208 |
if best_hyperparams_path.exists():
|
209 |
with open(best_hyperparams_path, "r") as f:
|
210 |
best_hyperparams = json.load(f)
|
@@ -212,10 +215,11 @@ def setup_trainer(cfg: DictConfig):
|
|
212 |
logger.info(f"Loaded best hyperparameters for testing: {best_hyperparams}")
|
213 |
else:
|
214 |
logger.error(
|
215 |
-
"Best hyperparameters not found!
|
216 |
)
|
217 |
raise FileNotFoundError("Best hyperparameters not found!")
|
218 |
|
|
|
219 |
data_module: L.LightningDataModule = hydra.utils.instantiate(cfg.data)
|
220 |
model: L.LightningModule = hydra.utils.instantiate(cfg.model)
|
221 |
trainer = Trainer(**cfg.trainer, logger=instantiate_loggers(cfg.logger))
|
|
|
168 |
|
169 |
setup_logger(
|
170 |
Path(cfg.paths.log_dir)
|
171 |
+
/ ("train.log" if cfg.task_name == "train" else "test.log")
|
172 |
)
|
173 |
|
174 |
# Instantiate callbacks
|
175 |
callbacks = instantiate_callbacks(cfg.callbacks)
|
176 |
logger.info(f"Callbacks: {callbacks}")
|
177 |
|
178 |
+
# Training phase with Optuna
|
179 |
if cfg.get("train", False):
|
|
|
180 |
clear_checkpoint_directory(cfg.paths.ckpt_dir)
|
181 |
+
|
182 |
+
# Optuna study setup
|
183 |
pruner = optuna.pruners.MedianPruner()
|
184 |
study = optuna.create_study(
|
185 |
direction="maximize", pruner=pruner, study_name="pytorch_lightning_optuna"
|
186 |
)
|
187 |
study.optimize(
|
188 |
lambda trial: objective(trial, cfg, callbacks),
|
189 |
+
n_trials=cfg.n_trials,
|
190 |
show_progress_bar=True,
|
191 |
)
|
192 |
|
193 |
+
# Log best trial results and save hyperparameters
|
194 |
best_trial = study.best_trial
|
195 |
logger.info(f"Best trial number: {best_trial.number}")
|
196 |
logger.info(f"Best trial value (val_acc): {best_trial.value}")
|
197 |
+
best_hyperparams = best_trial.params
|
198 |
+
logger.info(f"Best hyperparameters: {best_hyperparams}")
|
199 |
|
200 |
+
# Save best hyperparameters to JSON
|
|
|
201 |
best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
|
202 |
with open(best_hyperparams_path, "w") as f:
|
203 |
+
json.dump(best_hyperparams, f, indent=4)
|
204 |
logger.info(f"Best hyperparameters saved to {best_hyperparams_path}")
|
205 |
|
206 |
+
# Testing phase with best hyperparameters
|
207 |
if cfg.get("test", False):
|
208 |
best_hyperparams_path = Path(cfg.paths.ckpt_dir) / "best_hyperparams.json"
|
209 |
+
logger.info(f"Testing with best hyperparameters from {best_hyperparams_path}")
|
210 |
+
|
211 |
if best_hyperparams_path.exists():
|
212 |
with open(best_hyperparams_path, "r") as f:
|
213 |
best_hyperparams = json.load(f)
|
|
|
215 |
logger.info(f"Loaded best hyperparameters for testing: {best_hyperparams}")
|
216 |
else:
|
217 |
logger.error(
|
218 |
+
"Best hyperparameters not found! Ensure training has run with `train=True` and saved the hyperparameters."
|
219 |
)
|
220 |
raise FileNotFoundError("Best hyperparameters not found!")
|
221 |
|
222 |
+
# Initialize data module, model, and trainer for testing
|
223 |
data_module: L.LightningDataModule = hydra.utils.instantiate(cfg.data)
|
224 |
model: L.LightningModule = hydra.utils.instantiate(cfg.model)
|
225 |
trainer = Trainer(**cfg.trainer, logger=instantiate_loggers(cfg.logger))
|