Soutrik committed on
Commit
0f27535
·
1 Parent(s): 4ff4028

train and eval working

Browse files
Files changed (39) hide show
  1. .gitignore +3 -1
  2. configs/callbacks/default.yaml +2 -2
  3. configs/callbacks/{model_summary.yaml → rich_model_summary.yaml} +1 -1
  4. configs/data/catdog.yaml +1 -2
  5. configs/experiment/catdog_experiment.yaml +2 -2
  6. configs/paths/catdog.yaml +1 -1
  7. configs/train.yaml +3 -0
  8. logs/train/runs/2024-11-08_15-27-10/.hydra/config.yaml +0 -95
  9. logs/train/runs/2024-11-08_15-27-10/.hydra/hydra.yaml +0 -174
  10. logs/train/runs/2024-11-08_15-27-10/.hydra/overrides.yaml +0 -1
  11. logs/train/runs/2024-11-08_15-27-10/train.log +0 -0
  12. logs/train/runs/2024-11-08_15-29-07/.hydra/config.yaml +0 -95
  13. logs/train/runs/2024-11-08_15-29-07/.hydra/hydra.yaml +0 -174
  14. logs/train/runs/2024-11-08_15-29-07/.hydra/overrides.yaml +0 -1
  15. logs/train/runs/2024-11-08_15-29-07/train.log +0 -0
  16. logs/train/runs/2024-11-08_15-29-42/.hydra/config.yaml +0 -95
  17. logs/train/runs/2024-11-08_15-29-42/.hydra/hydra.yaml +0 -174
  18. logs/train/runs/2024-11-08_15-29-42/.hydra/overrides.yaml +0 -1
  19. logs/train/runs/2024-11-08_15-29-42/train.log +0 -0
  20. logs/train/runs/2024-11-08_15-30-22/.hydra/config.yaml +0 -95
  21. logs/train/runs/2024-11-08_15-30-22/.hydra/hydra.yaml +0 -174
  22. logs/train/runs/2024-11-08_15-30-22/.hydra/overrides.yaml +0 -1
  23. logs/train/runs/2024-11-08_15-30-22/train.log +0 -0
  24. logs/train/runs/2024-11-08_15-35-40/.hydra/config.yaml +0 -95
  25. logs/train/runs/2024-11-08_15-35-40/.hydra/hydra.yaml +0 -174
  26. logs/train/runs/2024-11-08_15-35-40/.hydra/overrides.yaml +0 -1
  27. logs/train/runs/2024-11-08_15-35-40/train.log +0 -0
  28. logs/train/runs/2024-11-08_15-35-57/.hydra/config.yaml +0 -95
  29. logs/train/runs/2024-11-08_15-35-57/.hydra/hydra.yaml +0 -174
  30. logs/train/runs/2024-11-08_15-35-57/.hydra/overrides.yaml +0 -1
  31. logs/train/runs/2024-11-08_15-35-57/train.log +0 -0
  32. logs/train/runs/2024-11-08_15-37-45/.hydra/config.yaml +0 -95
  33. logs/train/runs/2024-11-08_15-37-45/.hydra/hydra.yaml +0 -174
  34. logs/train/runs/2024-11-08_15-37-45/.hydra/overrides.yaml +0 -1
  35. logs/train/runs/2024-11-08_15-37-45/train.log +0 -0
  36. notebooks/training_lightning.ipynb +889 -0
  37. src/datamodules/catdog_datamodule.py +15 -4
  38. src/models/catdog_model.py +6 -10
  39. src/train.py +184 -0
.gitignore CHANGED
@@ -20,4 +20,6 @@ app/core/__pycache__/
20
  src/__pycache__/test_infra.cpython-310.pyc
21
  app/core/__pycache__/config.cpython-310.pyc
22
  data/
23
- !configs/data/
 
 
 
20
  src/__pycache__/test_infra.cpython-310.pyc
21
  app/core/__pycache__/config.cpython-310.pyc
22
  data/
23
+ !configs/data/
24
+ checkpoints/
25
+ logs/
configs/callbacks/default.yaml CHANGED
@@ -1,7 +1,7 @@
1
  defaults:
2
  - model_checkpoint
3
  - early_stopping
4
- - model_summary
5
  - rich_progress_bar
6
  - _self_
7
 
@@ -17,7 +17,7 @@ early_stopping:
17
  patience: 3
18
  mode: "min"
19
 
20
- model_summary:
21
  max_depth: -1
22
 
23
  rich_progress_bar:
 
1
  defaults:
2
  - model_checkpoint
3
  - early_stopping
4
+ - rich_model_summary
5
  - rich_progress_bar
6
  - _self_
7
 
 
17
  patience: 3
18
  mode: "min"
19
 
20
+ rich_model_summary:
21
  max_depth: -1
22
 
23
  rich_progress_bar:
configs/callbacks/{model_summary.yaml → rich_model_summary.yaml} RENAMED
@@ -1,2 +1,2 @@
1
- model_summary:
2
  max_depth: 1
 
1
+ rich_model_summary:
2
  max_depth: 1
configs/data/catdog.yaml CHANGED
@@ -6,5 +6,4 @@ num_workers: 4
6
  batch_size: 32
7
  train_val_split: [0.8, 0.2]
8
  pin_memory: False
9
- image_size: 160
10
- dataset_url: "https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip"
 
6
  batch_size: 32
7
  train_val_split: [0.8, 0.2]
8
  pin_memory: False
9
+ image_size: 160
 
configs/experiment/catdog_experiment.yaml CHANGED
@@ -38,7 +38,7 @@ model:
38
 
39
  trainer:
40
  min_epochs: 1
41
- max_epochs: 6
42
 
43
  callbacks:
44
  model_checkpoint:
@@ -54,7 +54,7 @@ callbacks:
54
  mode: "max"
55
  verbose: True
56
 
57
- model_summary:
58
  max_depth: 1
59
 
60
  rich_progress_bar:
 
38
 
39
  trainer:
40
  min_epochs: 1
41
+ max_epochs: 10
42
 
43
  callbacks:
44
  model_checkpoint:
 
54
  mode: "max"
55
  verbose: True
56
 
57
+ rich_model_summary:
58
  max_depth: 1
59
 
60
  rich_progress_bar:
configs/paths/catdog.yaml CHANGED
@@ -16,7 +16,7 @@ ckpt_dir: ${paths.root_dir}/checkpoints
16
  artifact_dir: ${paths.root_dir}/artifacts/
17
 
18
  # download url for the dataset
19
- data_url: "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip"
20
 
21
  # path to output directory, created dynamically by hydra
22
  # path generation pattern is specified in `configs/hydra/default.yaml`
 
16
  artifact_dir: ${paths.root_dir}/artifacts/
17
 
18
  # download url for the dataset
19
+ data_url: "https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip"
20
 
21
  # path to output directory, created dynamically by hydra
22
  # path generation pattern is specified in `configs/hydra/default.yaml`
configs/train.yaml CHANGED
@@ -39,3 +39,6 @@ seed: 42
39
 
40
  # name of the experiment
41
  name: "dogbreed_experiment"
 
 
 
 
39
 
40
  # name of the experiment
41
  name: "dogbreed_experiment"
42
+
43
+ # optimization metric
44
+ optimization_metric: "val_acc"
logs/train/runs/2024-11-08_15-27-10/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-27-10/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-27-10
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-27-10/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-27-10/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-29-07/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-29-07/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-29-07
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-29-07/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-29-07/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-29-42/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-29-42/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-29-42
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-29-42/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-29-42/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-30-22/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-30-22/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-30-22
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-30-22/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-30-22/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-35-40/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-35-40/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test2
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-35-40
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-35-40/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-35-40/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-35-57/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: false
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: false
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-35-57/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test2
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-35-57
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-35-57/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-35-57/train.log DELETED
File without changes
logs/train/runs/2024-11-08_15-37-45/.hydra/config.yaml DELETED
@@ -1,95 +0,0 @@
1
- task_name: train
2
- tags:
3
- - dev
4
- train: true
5
- test: false
6
- ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt
7
- seed: 42
8
- name: catdog_experiment
9
- data:
10
- _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule
11
- data_dir: ${paths.data_dir}
12
- url: ${paths.data_url}
13
- num_workers: 8
14
- batch_size: 64
15
- train_val_split:
16
- - 0.8
17
- - 0.2
18
- pin_memory: true
19
- image_size: 160
20
- dataset_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip
21
- model:
22
- _target_: src.models.catdog_model.ViTTinyClassifier
23
- img_size: 160
24
- patch_size: 16
25
- num_classes: 2
26
- embed_dim: 64
27
- depth: 6
28
- num_heads: 2
29
- mlp_ratio: 3
30
- pre_norm: false
31
- lr: 0.001
32
- weight_decay: 1.0e-05
33
- factor: 0.1
34
- patience: 10
35
- min_lr: 1.0e-06
36
- callbacks:
37
- model_checkpoint:
38
- dirpath: ${paths.ckpt_dir}
39
- filename: best-checkpoint
40
- monitor: val_acc
41
- verbose: true
42
- save_last: true
43
- save_top_k: 1
44
- mode: max
45
- auto_insert_metric_name: false
46
- save_weights_only: false
47
- every_n_train_steps: null
48
- train_time_interval: null
49
- every_n_epochs: null
50
- save_on_train_epoch_end: null
51
- early_stopping:
52
- monitor: val_acc
53
- min_delta: 0.0
54
- patience: 10
55
- verbose: true
56
- mode: max
57
- strict: true
58
- check_finite: true
59
- stopping_threshold: null
60
- divergence_threshold: null
61
- check_on_train_epoch_end: null
62
- model_summary:
63
- max_depth: 1
64
- rich_progress_bar:
65
- refresh_rate: 1
66
- logger:
67
- csv:
68
- save_dir: ${paths.output_dir}
69
- name: csv/
70
- prefix: ''
71
- tensorboard:
72
- save_dir: ${paths.output_dir}/tensorboard/
73
- name: null
74
- log_graph: false
75
- default_hp_metric: true
76
- prefix: ''
77
- trainer:
78
- _target_: lightning.Trainer
79
- default_root_dir: ${paths.output_dir}
80
- min_epochs: 1
81
- max_epochs: 6
82
- accelerator: auto
83
- devices: auto
84
- deterministic: true
85
- log_every_n_steps: 10
86
- fast_dev_run: false
87
- paths:
88
- root_dir: ${oc.env:PROJECT_ROOT}
89
- data_dir: ${paths.root_dir}/data/
90
- log_dir: ${paths.root_dir}/logs/
91
- ckpt_dir: ${paths.root_dir}/checkpoints
92
- artifact_dir: ${paths.root_dir}/artifacts/
93
- data_url: https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip
94
- output_dir: ${hydra:runtime.output_dir}
95
- work_dir: ${hydra:runtime.cwd}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-37-45/.hydra/hydra.yaml DELETED
@@ -1,174 +0,0 @@
1
- hydra:
2
- run:
3
- dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S}
4
- sweep:
5
- dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S}
6
- subdir: ${hydra.job.num}
7
- launcher:
8
- _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
- sweeper:
10
- _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
- max_batch_size: null
12
- params: null
13
- help:
14
- app_name: ${hydra.job.name}
15
- header: '${hydra.help.app_name} is powered by Hydra.
16
-
17
- '
18
- footer: 'Powered by Hydra (https://hydra.cc)
19
-
20
- Use --hydra-help to view Hydra specific help
21
-
22
- '
23
- template: '${hydra.help.header}
24
-
25
- == Configuration groups ==
26
-
27
- Compose your configuration from those groups (group=option)
28
-
29
-
30
- $APP_CONFIG_GROUPS
31
-
32
-
33
- == Config ==
34
-
35
- Override anything in the config (foo.bar=value)
36
-
37
-
38
- $CONFIG
39
-
40
-
41
- ${hydra.help.footer}
42
-
43
- '
44
- hydra_help:
45
- template: 'Hydra (${hydra.runtime.version})
46
-
47
- See https://hydra.cc for more info.
48
-
49
-
50
- == Flags ==
51
-
52
- $FLAGS_HELP
53
-
54
-
55
- == Configuration groups ==
56
-
57
- Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
- to command line)
59
-
60
-
61
- $HYDRA_CONFIG_GROUPS
62
-
63
-
64
- Use ''--cfg hydra'' to Show the Hydra config.
65
-
66
- '
67
- hydra_help: ???
68
- hydra_logging:
69
- version: 1
70
- formatters:
71
- colorlog:
72
- (): colorlog.ColoredFormatter
73
- format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
74
- handlers:
75
- console:
76
- class: logging.StreamHandler
77
- formatter: colorlog
78
- stream: ext://sys.stdout
79
- root:
80
- level: INFO
81
- handlers:
82
- - console
83
- disable_existing_loggers: false
84
- job_logging:
85
- version: 1
86
- formatters:
87
- simple:
88
- format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
89
- colorlog:
90
- (): colorlog.ColoredFormatter
91
- format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
92
- - %(message)s'
93
- log_colors:
94
- DEBUG: purple
95
- INFO: green
96
- WARNING: yellow
97
- ERROR: red
98
- CRITICAL: red
99
- handlers:
100
- console:
101
- class: logging.StreamHandler
102
- formatter: colorlog
103
- stream: ext://sys.stdout
104
- file:
105
- class: logging.FileHandler
106
- formatter: simple
107
- filename: ${hydra.runtime.output_dir}/${task_name}.log
108
- root:
109
- level: INFO
110
- handlers:
111
- - console
112
- - file
113
- disable_existing_loggers: false
114
- env: {}
115
- mode: RUN
116
- searchpath: []
117
- callbacks: {}
118
- output_subdir: .hydra
119
- overrides:
120
- hydra:
121
- - hydra.mode=RUN
122
- task: []
123
- job:
124
- name: hydra_test2
125
- chdir: null
126
- override_dirname: ''
127
- id: ???
128
- num: ???
129
- config_name: train
130
- env_set: {}
131
- env_copy: []
132
- config:
133
- override_dirname:
134
- kv_sep: '='
135
- item_sep: ','
136
- exclude_keys: []
137
- runtime:
138
- version: 1.3.2
139
- version_base: '1.1'
140
- cwd: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws
141
- config_sources:
142
- - path: hydra.conf
143
- schema: pkg
144
- provider: hydra
145
- - path: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/configs
146
- schema: file
147
- provider: main
148
- - path: hydra_plugins.hydra_colorlog.conf
149
- schema: pkg
150
- provider: hydra-colorlog
151
- - path: ''
152
- schema: structured
153
- provider: schema
154
- output_dir: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/train/runs/2024-11-08_15-37-45
155
- choices:
156
- debug: null
157
- experiment: catdog_experiment
158
- hydra: default
159
- paths: catdog
160
- trainer: default
161
- logger: default
162
- callbacks: default
163
- model: catdog_classifier
164
- data: catdog
165
- hydra/env: default
166
- hydra/callbacks: null
167
- hydra/job_logging: colorlog
168
- hydra/hydra_logging: colorlog
169
- hydra/hydra_help: default
170
- hydra/help: default
171
- hydra/sweeper: basic
172
- hydra/launcher: basic
173
- hydra/output: default
174
- verbose: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train/runs/2024-11-08_15-37-45/.hydra/overrides.yaml DELETED
@@ -1 +0,0 @@
1
- []
 
 
logs/train/runs/2024-11-08_15-37-45/train.log DELETED
File without changes
notebooks/training_lightning.ipynb ADDED
@@ -0,0 +1,889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/javascript": "IPython.notebook.set_autosave_interval(300000)"
11
+ },
12
+ "metadata": {},
13
+ "output_type": "display_data"
14
+ },
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Autosaving every 300 seconds\n"
20
+ ]
21
+ }
22
+ ],
23
+ "source": [
24
+ "%autosave 300\n",
25
+ "%load_ext autoreload\n",
26
+ "%autoreload 2\n",
27
+ "%reload_ext autoreload\n",
28
+ "%config Completer.use_jedi = False"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stdout",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "/mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "\n",
46
+ "import os\n",
47
+ "\n",
48
+ "os.chdir(\"..\")\n",
49
+ "print(os.getcwd())"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stderr",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "/anaconda/envs/emlo_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
62
+ " from .autonotebook import tqdm as notebook_tqdm\n"
63
+ ]
64
+ }
65
+ ],
66
+ "source": [
67
+ "import os\n",
68
+ "import shutil\n",
69
+ "from pathlib import Path\n",
70
+ "import torch\n",
71
+ "import lightning as L\n",
72
+ "from lightning.pytorch.loggers import Logger\n",
73
+ "from typing import List\n",
74
+ "from src.datamodules.catdog_datamodule import CatDogImageDataModule\n",
75
+ "from src.utils.logging_utils import setup_logger, task_wrapper\n",
76
+ "from loguru import logger\n",
77
+ "from dotenv import load_dotenv, find_dotenv\n",
78
+ "import rootutils\n",
79
+ "import hydra\n",
80
+ "from omegaconf import DictConfig, OmegaConf\n",
81
+ "from lightning.pytorch.callbacks import (\n",
82
+ " ModelCheckpoint,\n",
83
+ " EarlyStopping,\n",
84
+ " RichModelSummary,\n",
85
+ " RichProgressBar,\n",
86
+ ")\n",
87
+ "from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 4,
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "name": "stderr",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "\u001b[32m2024-11-08 18:25:17.572\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m9\u001b[0m - \u001b[31m\u001b[1mname '__file__' is not defined\u001b[0m\n"
100
+ ]
101
+ }
102
+ ],
103
+ "source": [
104
+ "# Load environment variables\n",
105
+ "load_dotenv(find_dotenv(\".env\"))\n",
106
+ "\n",
107
+ "# Setup root directory\n",
108
+ "try:\n",
109
+ " root = rootutils.setup_root(__file__, indicator=\".project-root\")\n",
110
+ "\n",
111
+ "except Exception as e:\n",
112
+ " logger.error(e)\n",
113
+ " root = Path(os.getcwd())\n",
114
+ " os.environ[\"PROJECT_ROOT\"] = str(root)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 5,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "def load_checkpoint_if_available(ckpt_path: str) -> str:\n",
124
+ " \"\"\"Check if the specified checkpoint exists and return the valid checkpoint path.\"\"\"\n",
125
+ " if ckpt_path and Path(ckpt_path).exists():\n",
126
+ " logger.info(f\"Checkpoint found: {ckpt_path}\")\n",
127
+ " return ckpt_path\n",
128
+ " else:\n",
129
+ " logger.warning(\n",
130
+ " f\"No checkpoint found at {ckpt_path}. Using current model weights.\"\n",
131
+ " )\n",
132
+ " return None\n",
133
+ "\n",
134
+ "\n",
135
+ "def clear_checkpoint_directory(ckpt_dir: str):\n",
136
+ " \"\"\"Clear all contents of the checkpoint directory without deleting the directory itself.\"\"\"\n",
137
+ " ckpt_dir_path = Path(ckpt_dir)\n",
138
+ " if ckpt_dir_path.exists() and ckpt_dir_path.is_dir():\n",
139
+ " logger.info(f\"Clearing checkpoint directory: {ckpt_dir}\")\n",
140
+ " # Iterate over all files and directories in the checkpoint directory and remove them\n",
141
+ " for item in ckpt_dir_path.iterdir():\n",
142
+ " try:\n",
143
+ " if item.is_file() or item.is_symlink():\n",
144
+ " item.unlink() # Remove file or symlink\n",
145
+ " elif item.is_dir():\n",
146
+ " shutil.rmtree(item) # Remove directory\n",
147
+ " except Exception as e:\n",
148
+ " logger.error(f\"Failed to delete {item}: {e}\")\n",
149
+ " logger.info(f\"Checkpoint directory cleared: {ckpt_dir}\")\n",
150
+ " else:\n",
151
+ " logger.info(\n",
152
+ " f\"Checkpoint directory does not exist. Creating directory: {ckpt_dir}\"\n",
153
+ " )\n",
154
+ " os.makedirs(ckpt_dir_path, exist_ok=True)\n",
155
+ "\n",
156
+ "\n",
157
+ "@task_wrapper\n",
158
+ "def train_module(\n",
159
+ " cfg: DictConfig,\n",
160
+ " data_module: L.LightningDataModule,\n",
161
+ " model: L.LightningModule,\n",
162
+ " trainer: L.Trainer,\n",
163
+ "):\n",
164
+ " \"\"\"Train the model using the provided Trainer and DataModule.\"\"\"\n",
165
+ " logger.info(\"Training the model\")\n",
166
+ " trainer.fit(model, data_module)\n",
167
+ " train_metrics = trainer.callback_metrics\n",
168
+ " try:\n",
169
+ " logger.info(\n",
170
+ " f\"Training completed with the following metrics- train_acc: {train_metrics['train_acc'].item()} and val_acc: {train_metrics['val_acc'].item()}\"\n",
171
+ " )\n",
172
+ " except KeyError:\n",
173
+ " logger.info(f\"Training completed with the following metrics:{train_metrics}\")\n",
174
+ "\n",
175
+ " return train_metrics\n",
176
+ "\n",
177
+ "\n",
178
+ "@task_wrapper\n",
179
+ "def run_test_module(\n",
180
+ " cfg: DictConfig,\n",
181
+ " datamodule: L.LightningDataModule,\n",
182
+ " model: L.LightningModule,\n",
183
+ " trainer: L.Trainer,\n",
184
+ "):\n",
185
+ " \"\"\"Test the model using the best checkpoint or the current model weights.\"\"\"\n",
186
+ " logger.info(\"Testing the model\")\n",
187
+ " datamodule.setup(stage=\"test\")\n",
188
+ "\n",
189
+ " ckpt_path = load_checkpoint_if_available(cfg.ckpt_path)\n",
190
+ "\n",
191
+ " # If no checkpoint is available, Lightning will use current model weights\n",
192
+ " test_metrics = trainer.test(model, datamodule, ckpt_path=ckpt_path)\n",
193
+ " logger.info(f\"Test metrics:\\n{test_metrics}\")\n",
194
+ "\n",
195
+ " return test_metrics[0] if test_metrics else {}"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 6,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "/tmp/ipykernel_487789/541470590.py:8: UserWarning: \n",
208
+ "The version_base parameter is not specified.\n",
209
+ "Please specify a compatability version level, or None.\n",
210
+ "Will assume defaults for version 1.1\n",
211
+ " with hydra.initialize(config_path=\"../configs\"):\n"
212
+ ]
213
+ },
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "Full Configuration:\n",
219
+ "task_name: train\n",
220
+ "tags:\n",
221
+ "- dev\n",
222
+ "train: true\n",
223
+ "test: false\n",
224
+ "ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt\n",
225
+ "seed: 42\n",
226
+ "name: catdog_experiment\n",
227
+ "data:\n",
228
+ " _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule\n",
229
+ " data_dir: ${paths.data_dir}\n",
230
+ " url: ${paths.data_url}\n",
231
+ " num_workers: 8\n",
232
+ " batch_size: 64\n",
233
+ " train_val_split:\n",
234
+ " - 0.8\n",
235
+ " - 0.2\n",
236
+ " pin_memory: true\n",
237
+ " image_size: 160\n",
238
+ "model:\n",
239
+ " _target_: src.models.catdog_model.ViTTinyClassifier\n",
240
+ " img_size: 160\n",
241
+ " patch_size: 16\n",
242
+ " num_classes: 2\n",
243
+ " embed_dim: 64\n",
244
+ " depth: 6\n",
245
+ " num_heads: 2\n",
246
+ " mlp_ratio: 3\n",
247
+ " pre_norm: false\n",
248
+ " lr: 0.001\n",
249
+ " weight_decay: 1.0e-05\n",
250
+ " factor: 0.1\n",
251
+ " patience: 10\n",
252
+ " min_lr: 1.0e-06\n",
253
+ "callbacks:\n",
254
+ " model_checkpoint:\n",
255
+ " dirpath: ${paths.ckpt_dir}\n",
256
+ " filename: best-checkpoint\n",
257
+ " monitor: val_acc\n",
258
+ " verbose: true\n",
259
+ " save_last: true\n",
260
+ " save_top_k: 1\n",
261
+ " mode: max\n",
262
+ " auto_insert_metric_name: false\n",
263
+ " save_weights_only: false\n",
264
+ " every_n_train_steps: null\n",
265
+ " train_time_interval: null\n",
266
+ " every_n_epochs: null\n",
267
+ " save_on_train_epoch_end: null\n",
268
+ " early_stopping:\n",
269
+ " monitor: val_acc\n",
270
+ " min_delta: 0.0\n",
271
+ " patience: 10\n",
272
+ " verbose: true\n",
273
+ " mode: max\n",
274
+ " strict: true\n",
275
+ " check_finite: true\n",
276
+ " stopping_threshold: null\n",
277
+ " divergence_threshold: null\n",
278
+ " check_on_train_epoch_end: null\n",
279
+ " rich_model_summary:\n",
280
+ " max_depth: 1\n",
281
+ " rich_progress_bar:\n",
282
+ " refresh_rate: 1\n",
283
+ "logger:\n",
284
+ " csv:\n",
285
+ " save_dir: ${paths.output_dir}\n",
286
+ " name: csv/\n",
287
+ " prefix: ''\n",
288
+ " tensorboard:\n",
289
+ " save_dir: ${paths.output_dir}/tensorboard/\n",
290
+ " name: null\n",
291
+ " log_graph: false\n",
292
+ " default_hp_metric: true\n",
293
+ " prefix: ''\n",
294
+ "trainer:\n",
295
+ " _target_: lightning.Trainer\n",
296
+ " default_root_dir: ${paths.output_dir}\n",
297
+ " min_epochs: 1\n",
298
+ " max_epochs: 6\n",
299
+ " accelerator: auto\n",
300
+ " devices: auto\n",
301
+ " deterministic: true\n",
302
+ " log_every_n_steps: 10\n",
303
+ " fast_dev_run: false\n",
304
+ "paths:\n",
305
+ " root_dir: ${oc.env:PROJECT_ROOT}\n",
306
+ " data_dir: ${paths.root_dir}/data/\n",
307
+ " log_dir: ${paths.root_dir}/logs/\n",
308
+ " ckpt_dir: ${paths.root_dir}/checkpoints\n",
309
+ " artifact_dir: ${paths.root_dir}/artifacts/\n",
310
+ " data_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\n",
311
+ " output_dir: ${hydra:runtime.output_dir}\n",
312
+ " work_dir: ${hydra:runtime.cwd}\n",
313
+ "\n"
314
+ ]
315
+ }
316
+ ],
317
+ "source": [
318
+ "import hydra\n",
319
+ "from omegaconf import DictConfig, OmegaConf\n",
320
+ "\n",
321
+ "\n",
322
+ "# Function to load the configuration as an object without using the @hydra.main decorator\n",
323
+ "def load_config() -> DictConfig:\n",
324
+ " # Initialize the configuration context (e.g., \"../configs\" directory)\n",
325
+ " with hydra.initialize(config_path=\"../configs\"):\n",
326
+ " # Compose the configuration object with a specific config name (e.g., \"train\")\n",
327
+ " cfg = hydra.compose(config_name=\"train\")\n",
328
+ " return cfg\n",
329
+ "\n",
330
+ "\n",
331
+ "# Load the configuration\n",
332
+ "cfg = load_config()\n",
333
+ "\n",
334
+ "# Print the entire configuration for reference\n",
335
+ "print(\"Full Configuration:\")\n",
336
+ "print(OmegaConf.to_yaml(cfg))"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 7,
342
+ "metadata": {},
343
+ "outputs": [
344
+ {
345
+ "name": "stderr",
346
+ "output_type": "stream",
347
+ "text": [
348
+ "\u001b[32m2024-11-08 18:25:23\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m8\u001b[0m - \u001b[1mWhole Config:\n",
349
+ "task_name: train\n",
350
+ "tags:\n",
351
+ "- dev\n",
352
+ "train: true\n",
353
+ "test: false\n",
354
+ "ckpt_path: ${paths.ckpt_dir}/best-checkpoint.ckpt\n",
355
+ "seed: 42\n",
356
+ "name: catdog_experiment\n",
357
+ "data:\n",
358
+ " _target_: src.datamodules.catdog_datamodule.CatDogImageDataModule\n",
359
+ " data_dir: ${paths.data_dir}\n",
360
+ " url: ${paths.data_url}\n",
361
+ " num_workers: 8\n",
362
+ " batch_size: 64\n",
363
+ " train_val_split:\n",
364
+ " - 0.8\n",
365
+ " - 0.2\n",
366
+ " pin_memory: true\n",
367
+ " image_size: 160\n",
368
+ "model:\n",
369
+ " _target_: src.models.catdog_model.ViTTinyClassifier\n",
370
+ " img_size: 160\n",
371
+ " patch_size: 16\n",
372
+ " num_classes: 2\n",
373
+ " embed_dim: 64\n",
374
+ " depth: 6\n",
375
+ " num_heads: 2\n",
376
+ " mlp_ratio: 3\n",
377
+ " pre_norm: false\n",
378
+ " lr: 0.001\n",
379
+ " weight_decay: 1.0e-05\n",
380
+ " factor: 0.1\n",
381
+ " patience: 10\n",
382
+ " min_lr: 1.0e-06\n",
383
+ "callbacks:\n",
384
+ " model_checkpoint:\n",
385
+ " dirpath: ${paths.ckpt_dir}\n",
386
+ " filename: best-checkpoint\n",
387
+ " monitor: val_acc\n",
388
+ " verbose: true\n",
389
+ " save_last: true\n",
390
+ " save_top_k: 1\n",
391
+ " mode: max\n",
392
+ " auto_insert_metric_name: false\n",
393
+ " save_weights_only: false\n",
394
+ " every_n_train_steps: null\n",
395
+ " train_time_interval: null\n",
396
+ " every_n_epochs: null\n",
397
+ " save_on_train_epoch_end: null\n",
398
+ " early_stopping:\n",
399
+ " monitor: val_acc\n",
400
+ " min_delta: 0.0\n",
401
+ " patience: 10\n",
402
+ " verbose: true\n",
403
+ " mode: max\n",
404
+ " strict: true\n",
405
+ " check_finite: true\n",
406
+ " stopping_threshold: null\n",
407
+ " divergence_threshold: null\n",
408
+ " check_on_train_epoch_end: null\n",
409
+ " rich_model_summary:\n",
410
+ " max_depth: 1\n",
411
+ " rich_progress_bar:\n",
412
+ " refresh_rate: 1\n",
413
+ "logger:\n",
414
+ " csv:\n",
415
+ " save_dir: ${paths.output_dir}\n",
416
+ " name: csv/\n",
417
+ " prefix: ''\n",
418
+ " tensorboard:\n",
419
+ " save_dir: ${paths.output_dir}/tensorboard/\n",
420
+ " name: null\n",
421
+ " log_graph: false\n",
422
+ " default_hp_metric: true\n",
423
+ " prefix: ''\n",
424
+ "trainer:\n",
425
+ " _target_: lightning.Trainer\n",
426
+ " default_root_dir: ${paths.output_dir}\n",
427
+ " min_epochs: 1\n",
428
+ " max_epochs: 6\n",
429
+ " accelerator: auto\n",
430
+ " devices: auto\n",
431
+ " deterministic: true\n",
432
+ " log_every_n_steps: 10\n",
433
+ " fast_dev_run: false\n",
434
+ "paths:\n",
435
+ " root_dir: ${oc.env:PROJECT_ROOT}\n",
436
+ " data_dir: ${paths.root_dir}/data/\n",
437
+ " log_dir: ${paths.root_dir}/logs/\n",
438
+ " ckpt_dir: ${paths.root_dir}/checkpoints\n",
439
+ " artifact_dir: ${paths.root_dir}/artifacts/\n",
440
+ " data_url: https://download.pytorch.org/tutorials/cats_and_dogs_filtered.zip\n",
441
+ " output_dir: ${hydra:runtime.output_dir}\n",
442
+ " work_dir: ${hydra:runtime.cwd}\n",
443
+ "\u001b[0m\n"
444
+ ]
445
+ }
446
+ ],
447
+ "source": [
448
+ "# Initialize logger\n",
449
+ "if cfg.task_name == \"train\":\n",
450
+ " log_path = Path(cfg.paths.log_dir) / \"train.log\"\n",
451
+ "else:\n",
452
+ " log_path = Path(cfg.paths.log_dir) / \"eval.log\"\n",
453
+ "setup_logger(log_path)\n",
454
+ "\n",
455
+ "logger.info(f\"Whole Config:\\n{OmegaConf.to_yaml(cfg)}\")"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 8,
461
+ "metadata": {},
462
+ "outputs": [
463
+ {
464
+ "name": "stderr",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m3\u001b[0m - \u001b[1mRoot directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws\u001b[0m\n",
468
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m5\u001b[0m - \u001b[1mCurrent working directory: ['.dvc', '.dvcignore', '.env', '.git', '.github', '.gitignore', '.project-root', 'aws', 'basic_setup.md', 'configs', 'data', 'data.dvc', 'docker-compose.yaml', 'Dockerfile', 'ec2_runner_setup.md', 'logs', 'main.py', 'notebooks', 'poetry.lock', 'pyproject.toml', 'README.md', 'setup_aws_ci.md', 'src', 'tests', 'todo.md']\u001b[0m\n",
469
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m8\u001b[0m - \u001b[1mCheckpoint directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/checkpoints\u001b[0m\n",
470
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m12\u001b[0m - \u001b[1mData directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/data/\u001b[0m\n",
471
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m16\u001b[0m - \u001b[1mLog directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/logs/\u001b[0m\n",
472
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mArtifact directory: /mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-vm-dev/code/Users/Soutrik.Chowdhury/pytorch-template-aws/artifacts/\u001b[0m\n",
473
+ "\u001b[32m2024-11-08 18:25:25\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m28\u001b[0m - \u001b[1mExperiment name: catdog_experiment\u001b[0m\n"
474
+ ]
475
+ }
476
+ ],
477
+ "source": [
478
+ "# the path to the checkpoint directory\n",
479
+ "root_dir = cfg.paths.root_dir\n",
480
+ "logger.info(f\"Root directory: {root_dir}\")\n",
481
+ "\n",
482
+ "logger.info(f\"Current working directory: {os.listdir(root_dir)}\")\n",
483
+ "\n",
484
+ "ckpt_dir = cfg.paths.ckpt_dir\n",
485
+ "logger.info(f\"Checkpoint directory: {ckpt_dir}\")\n",
486
+ "\n",
487
+ "# the path to the data directory\n",
488
+ "data_dir = cfg.paths.data_dir\n",
489
+ "logger.info(f\"Data directory: {data_dir}\")\n",
490
+ "\n",
491
+ "# the path to the log directory\n",
492
+ "log_dir = cfg.paths.log_dir\n",
493
+ "logger.info(f\"Log directory: {log_dir}\")\n",
494
+ "\n",
495
+ "# the path to the artifact directory\n",
496
+ "artifact_dir = cfg.paths.artifact_dir\n",
497
+ "logger.info(f\"Artifact directory: {artifact_dir}\")\n",
498
+ "\n",
499
+ "# output directory\n",
500
+ "# output_dir = cfg.paths.output_dir\n",
501
+ "# logger.info(f\"Output directory: {output_dir}\")\n",
502
+ "\n",
503
+ "# name of the experiment\n",
504
+ "experiment_name = cfg.name\n",
505
+ "logger.info(f\"Experiment name: {experiment_name}\")\n"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 9,
511
+ "metadata": {},
512
+ "outputs": [
513
+ {
514
+ "name": "stderr",
515
+ "output_type": "stream",
516
+ "text": [
517
+ "\u001b[32m2024-11-08 18:25:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mInstantiating datamodule <src.datamodules.catdog_datamodule.CatDogImageDataModule>\u001b[0m\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "# Initialize DataModule\n",
523
+ "logger.info(f\"Instantiating datamodule <{cfg.data._target_}>\")\n",
524
+ "datamodule: L.LightningDataModule = hydra.utils.instantiate(cfg.data)"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": 10,
530
+ "metadata": {},
531
+ "outputs": [
532
+ {
533
+ "name": "stderr",
534
+ "output_type": "stream",
535
+ "text": [
536
+ "\u001b[32m2024-11-08 18:25:28\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mNo GPU available\u001b[0m\n",
537
+ "Seed set to 42\n"
538
+ ]
539
+ },
540
+ {
541
+ "data": {
542
+ "text/plain": [
543
+ "42"
544
+ ]
545
+ },
546
+ "execution_count": 10,
547
+ "metadata": {},
548
+ "output_type": "execute_result"
549
+ }
550
+ ],
551
+ "source": [
552
+ "# Check for GPU availability\n",
553
+ "logger.info(\"GPU available\" if torch.cuda.is_available() else \"No GPU available\")\n",
554
+ "\n",
555
+ "# Set seed for reproducibility\n",
556
+ "L.seed_everything(cfg.seed, workers=True)"
557
+ ]
558
+ },
559
+ {
560
+ "cell_type": "code",
561
+ "execution_count": 11,
562
+ "metadata": {},
563
+ "outputs": [
564
+ {
565
+ "name": "stderr",
566
+ "output_type": "stream",
567
+ "text": [
568
+ "\u001b[32m2024-11-08 18:25:29\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m2\u001b[0m - \u001b[1mInstantiating model <src.models.catdog_model.ViTTinyClassifier>\u001b[0m\n"
569
+ ]
570
+ }
571
+ ],
572
+ "source": [
573
+ "# Initialize model\n",
574
+ "logger.info(f\"Instantiating model <{cfg.model._target_}>\")\n",
575
+ "model: L.LightningModule = hydra.utils.instantiate(cfg.model)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 12,
581
+ "metadata": {},
582
+ "outputs": [
583
+ {
584
+ "name": "stderr",
585
+ "output_type": "stream",
586
+ "text": [
587
+ "\u001b[32m2024-11-08 18:25:30\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mModel summary:\n",
588
+ "ViTTinyClassifier(\n",
589
+ " (model): VisionTransformer(\n",
590
+ " (patch_embed): PatchEmbed(\n",
591
+ " (proj): Conv2d(3, 64, kernel_size=(16, 16), stride=(16, 16))\n",
592
+ " (norm): Identity()\n",
593
+ " )\n",
594
+ " (pos_drop): Dropout(p=0.0, inplace=False)\n",
595
+ " (patch_drop): Identity()\n",
596
+ " (norm_pre): Identity()\n",
597
+ " (blocks): Sequential(\n",
598
+ " (0): Block(\n",
599
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
600
+ " (attn): Attention(\n",
601
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
602
+ " (q_norm): Identity()\n",
603
+ " (k_norm): Identity()\n",
604
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
605
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
606
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
607
+ " )\n",
608
+ " (ls1): Identity()\n",
609
+ " (drop_path1): Identity()\n",
610
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
611
+ " (mlp): Mlp(\n",
612
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
613
+ " (act): GELU(approximate='none')\n",
614
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
615
+ " (norm): Identity()\n",
616
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
617
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
618
+ " )\n",
619
+ " (ls2): Identity()\n",
620
+ " (drop_path2): Identity()\n",
621
+ " )\n",
622
+ " (1): Block(\n",
623
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
624
+ " (attn): Attention(\n",
625
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
626
+ " (q_norm): Identity()\n",
627
+ " (k_norm): Identity()\n",
628
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
629
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
630
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
631
+ " )\n",
632
+ " (ls1): Identity()\n",
633
+ " (drop_path1): Identity()\n",
634
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
635
+ " (mlp): Mlp(\n",
636
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
637
+ " (act): GELU(approximate='none')\n",
638
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
639
+ " (norm): Identity()\n",
640
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
641
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
642
+ " )\n",
643
+ " (ls2): Identity()\n",
644
+ " (drop_path2): Identity()\n",
645
+ " )\n",
646
+ " (2): Block(\n",
647
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
648
+ " (attn): Attention(\n",
649
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
650
+ " (q_norm): Identity()\n",
651
+ " (k_norm): Identity()\n",
652
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
653
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
654
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
655
+ " )\n",
656
+ " (ls1): Identity()\n",
657
+ " (drop_path1): Identity()\n",
658
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
659
+ " (mlp): Mlp(\n",
660
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
661
+ " (act): GELU(approximate='none')\n",
662
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
663
+ " (norm): Identity()\n",
664
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
665
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
666
+ " )\n",
667
+ " (ls2): Identity()\n",
668
+ " (drop_path2): Identity()\n",
669
+ " )\n",
670
+ " (3): Block(\n",
671
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
672
+ " (attn): Attention(\n",
673
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
674
+ " (q_norm): Identity()\n",
675
+ " (k_norm): Identity()\n",
676
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
677
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
678
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
679
+ " )\n",
680
+ " (ls1): Identity()\n",
681
+ " (drop_path1): Identity()\n",
682
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
683
+ " (mlp): Mlp(\n",
684
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
685
+ " (act): GELU(approximate='none')\n",
686
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
687
+ " (norm): Identity()\n",
688
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
689
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
690
+ " )\n",
691
+ " (ls2): Identity()\n",
692
+ " (drop_path2): Identity()\n",
693
+ " )\n",
694
+ " (4): Block(\n",
695
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
696
+ " (attn): Attention(\n",
697
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
698
+ " (q_norm): Identity()\n",
699
+ " (k_norm): Identity()\n",
700
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
701
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
702
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
703
+ " )\n",
704
+ " (ls1): Identity()\n",
705
+ " (drop_path1): Identity()\n",
706
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
707
+ " (mlp): Mlp(\n",
708
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
709
+ " (act): GELU(approximate='none')\n",
710
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
711
+ " (norm): Identity()\n",
712
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
713
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
714
+ " )\n",
715
+ " (ls2): Identity()\n",
716
+ " (drop_path2): Identity()\n",
717
+ " )\n",
718
+ " (5): Block(\n",
719
+ " (norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
720
+ " (attn): Attention(\n",
721
+ " (qkv): Linear(in_features=64, out_features=192, bias=False)\n",
722
+ " (q_norm): Identity()\n",
723
+ " (k_norm): Identity()\n",
724
+ " (attn_drop): Dropout(p=0.0, inplace=False)\n",
725
+ " (proj): Linear(in_features=64, out_features=64, bias=True)\n",
726
+ " (proj_drop): Dropout(p=0.0, inplace=False)\n",
727
+ " )\n",
728
+ " (ls1): Identity()\n",
729
+ " (drop_path1): Identity()\n",
730
+ " (norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
731
+ " (mlp): Mlp(\n",
732
+ " (fc1): Linear(in_features=64, out_features=192, bias=True)\n",
733
+ " (act): GELU(approximate='none')\n",
734
+ " (drop1): Dropout(p=0.0, inplace=False)\n",
735
+ " (norm): Identity()\n",
736
+ " (fc2): Linear(in_features=192, out_features=64, bias=True)\n",
737
+ " (drop2): Dropout(p=0.0, inplace=False)\n",
738
+ " )\n",
739
+ " (ls2): Identity()\n",
740
+ " (drop_path2): Identity()\n",
741
+ " )\n",
742
+ " )\n",
743
+ " (norm): LayerNorm((64,), eps=1e-06, elementwise_affine=True)\n",
744
+ " (fc_norm): Identity()\n",
745
+ " (head_drop): Dropout(p=0.0, inplace=False)\n",
746
+ " (head): Linear(in_features=64, out_features=2, bias=True)\n",
747
+ " )\n",
748
+ " (train_metrics): ModuleDict(\n",
749
+ " (accuracy): MulticlassAccuracy()\n",
750
+ " (precision): MulticlassPrecision()\n",
751
+ " (recall): MulticlassRecall()\n",
752
+ " (f1): MulticlassF1Score()\n",
753
+ " )\n",
754
+ " (val_metrics): ModuleDict(\n",
755
+ " (accuracy): MulticlassAccuracy()\n",
756
+ " (precision): MulticlassPrecision()\n",
757
+ " (recall): MulticlassRecall()\n",
758
+ " (f1): MulticlassF1Score()\n",
759
+ " )\n",
760
+ " (test_metrics): ModuleDict(\n",
761
+ " (accuracy): MulticlassAccuracy()\n",
762
+ " (precision): MulticlassPrecision()\n",
763
+ " (recall): MulticlassRecall()\n",
764
+ " (f1): MulticlassF1Score()\n",
765
+ " )\n",
766
+ " (criterion): CrossEntropyLoss()\n",
767
+ ")\u001b[0m\n"
768
+ ]
769
+ }
770
+ ],
771
+ "source": [
772
+ "logger.info(f\"Model summary:\\n{model}\")"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "execution_count": 13,
778
+ "metadata": {},
779
+ "outputs": [],
780
+ "source": [
781
+ "def initialize_callbacks(cfg: DictConfig) -> List[L.Callback]:\n",
782
+ " \"\"\"Initialize the callbacks based on the configuration.\"\"\"\n",
783
+ " if not cfg:\n",
784
+ " logger.warning(\"No callback configs found! Skipping..\")\n",
785
+ " return callbacks\n",
786
+ "\n",
787
+ " if not isinstance(cfg, DictConfig):\n",
788
+ " raise TypeError(\"Callbacks config must be a DictConfig!\")\n",
789
+ " callbacks = []\n",
790
+ "\n",
791
+ " # Initialize the model checkpoint callback\n",
792
+ " model_checkpoint = ModelCheckpoint(**cfg.callbacks.model_checkpoint)\n",
793
+ " callbacks.append(model_checkpoint)\n",
794
+ "\n",
795
+ " # Initialize the early stopping callback\n",
796
+ " early_stopping = EarlyStopping(**cfg.callbacks.early_stopping)\n",
797
+ " callbacks.append(early_stopping)\n",
798
+ "\n",
799
+ " # Initialize the rich model summary callback\n",
800
+ " model_summary = RichModelSummary(**cfg.callbacks.rich_model_summary)\n",
801
+ " callbacks.append(model_summary)\n",
802
+ "\n",
803
+ " # Initialize the rich progress bar callback\n",
804
+ " progress_bar = RichProgressBar(**cfg.callbacks.rich_progress_bar)\n",
805
+ " callbacks.append(progress_bar)\n",
806
+ "\n",
807
+ " return callbacks\n",
808
+ "\n",
809
+ "\n",
810
+ "def initialize_logger(cfg: DictConfig) -> Logger:\n",
811
+ " \"\"\"Initialize the logger based on the configuration.\"\"\"\n",
812
+ " if not cfg:\n",
813
+ " logger.warning(\"No logger configs found! Skipping..\")\n",
814
+ " return None\n",
815
+ "\n",
816
+ " if not isinstance(cfg, DictConfig):\n",
817
+ " raise TypeError(\"Logger config must be a DictConfig!\")\n",
818
+ "\n",
819
+ " loggers = []\n",
820
+ "\n",
821
+ " # Initialize the TensorBoard logger\n",
822
+ " tensorboard_logger = TensorBoardLogger(**cfg.loggers.tensorboard)\n",
823
+ " loggers.append(tensorboard_logger)\n",
824
+ "\n",
825
+ " # Initialize the CSV logger\n",
826
+ " csv_logger = CSVLogger(**cfg.loggers.csv)\n",
827
+ " loggers.append(csv_logger)\n",
828
+ "\n",
829
+ " return loggers"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "code",
834
+ "execution_count": null,
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": []
838
+ },
839
+ {
840
+ "cell_type": "code",
841
+ "execution_count": null,
842
+ "metadata": {},
843
+ "outputs": [],
844
+ "source": []
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": null,
849
+ "metadata": {},
850
+ "outputs": [],
851
+ "source": []
852
+ },
853
+ {
854
+ "cell_type": "code",
855
+ "execution_count": null,
856
+ "metadata": {},
857
+ "outputs": [],
858
+ "source": []
859
+ },
860
+ {
861
+ "cell_type": "markdown",
862
+ "metadata": {},
863
+ "source": [
864
+ "########################################## End of the script ##########################################"
865
+ ]
866
+ }
867
+ ],
868
+ "metadata": {
869
+ "kernelspec": {
870
+ "display_name": "emlo_env",
871
+ "language": "python",
872
+ "name": "python3"
873
+ },
874
+ "language_info": {
875
+ "codemirror_mode": {
876
+ "name": "ipython",
877
+ "version": 3
878
+ },
879
+ "file_extension": ".py",
880
+ "mimetype": "text/x-python",
881
+ "name": "python",
882
+ "nbconvert_exporter": "python",
883
+ "pygments_lexer": "ipython3",
884
+ "version": "3.10.15"
885
+ }
886
+ },
887
+ "nbformat": 4,
888
+ "nbformat_minor": 2
889
+ }
src/datamodules/catdog_datamodule.py CHANGED
@@ -48,7 +48,8 @@ class CatDogImageDataModule(L.LightningDataModule):
48
 
49
  def setup(self, stage: Optional[str] = None):
50
  """Set up the train, validation, and test datasets."""
51
- transform = transforms.Compose(
 
52
  [
53
  transforms.Resize((self.image_size, self.image_size)),
54
  transforms.RandomHorizontalFlip(),
@@ -59,11 +60,21 @@ class CatDogImageDataModule(L.LightningDataModule):
59
  ]
60
  )
61
 
 
 
 
 
 
 
 
 
 
 
62
  train_path = self.data_dir / "cats_and_dogs_filtered" / "train"
63
  test_path = self.data_dir / "cats_and_dogs_filtered" / "validation"
64
 
65
  if stage == "fit" or stage is None:
66
- full_train_dataset = ImageFolder(root=train_path, transform=transform)
67
  train_size = int(self.train_val_split[0] * len(full_train_dataset))
68
  val_size = len(full_train_dataset) - train_size
69
  self.train_dataset, self.val_dataset = random_split(
@@ -74,7 +85,7 @@ class CatDogImageDataModule(L.LightningDataModule):
74
  )
75
 
76
  if stage == "test" or stage is None:
77
- self.test_dataset = ImageFolder(root=test_path, transform=transform)
78
  logger.info(f"Test dataset size: {len(self.test_dataset)} images.")
79
 
80
  def _create_dataloader(self, dataset, shuffle: bool = False) -> DataLoader:
@@ -123,7 +134,7 @@ if __name__ == "__main__":
123
  train_val_split=cfg.data.train_val_split,
124
  pin_memory=cfg.data.pin_memory,
125
  image_size=cfg.data.image_size,
126
- url=cfg.data.dataset_url,
127
  )
128
  datamodule.prepare_data()
129
  datamodule.setup()
 
48
 
49
  def setup(self, stage: Optional[str] = None):
50
  """Set up the train, validation, and test datasets."""
51
+
52
+ train_transform = transforms.Compose(
53
  [
54
  transforms.Resize((self.image_size, self.image_size)),
55
  transforms.RandomHorizontalFlip(),
 
60
  ]
61
  )
62
 
63
+ test_transform = transforms.Compose(
64
+ [
65
+ transforms.Resize((self.image_size, self.image_size)),
66
+ transforms.ToTensor(),
67
+ transforms.Normalize(
68
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
69
+ ),
70
+ ]
71
+ )
72
+
73
  train_path = self.data_dir / "cats_and_dogs_filtered" / "train"
74
  test_path = self.data_dir / "cats_and_dogs_filtered" / "validation"
75
 
76
  if stage == "fit" or stage is None:
77
+ full_train_dataset = ImageFolder(root=train_path, transform=train_transform)
78
  train_size = int(self.train_val_split[0] * len(full_train_dataset))
79
  val_size = len(full_train_dataset) - train_size
80
  self.train_dataset, self.val_dataset = random_split(
 
85
  )
86
 
87
  if stage == "test" or stage is None:
88
+ self.test_dataset = ImageFolder(root=test_path, transform=test_transform)
89
  logger.info(f"Test dataset size: {len(self.test_dataset)} images.")
90
 
91
  def _create_dataloader(self, dataset, shuffle: bool = False) -> DataLoader:
 
134
  train_val_split=cfg.data.train_val_split,
135
  pin_memory=cfg.data.pin_memory,
136
  image_size=cfg.data.image_size,
137
+ url=cfg.data.url,
138
  )
139
  datamodule.prepare_data()
140
  datamodule.setup()
src/models/catdog_model.py CHANGED
@@ -9,7 +9,7 @@ class ViTTinyClassifier(L.LightningModule):
9
  def __init__(
10
  self,
11
  img_size: int = 224,
12
- num_classes: int = 2,
13
  embed_dim: int = 64,
14
  depth: int = 6,
15
  num_heads: int = 2,
@@ -40,16 +40,12 @@ class ViTTinyClassifier(L.LightningModule):
40
  global_pool="token",
41
  )
42
 
43
- # Metrics for multi-class classification
44
  metrics = {
45
- "accuracy": Accuracy(task="multiclass", num_classes=num_classes),
46
- "precision": Precision(
47
- task="multiclass", num_classes=num_classes, average="macro"
48
- ),
49
- "recall": Recall(
50
- task="multiclass", num_classes=num_classes, average="macro"
51
- ),
52
- "f1": F1Score(task="multiclass", num_classes=num_classes, average="macro"),
53
  }
54
 
55
  # Initialize metrics for each stage
 
9
  def __init__(
10
  self,
11
  img_size: int = 224,
12
+ num_classes: int = 2, # Should be 2 for binary classification
13
  embed_dim: int = 64,
14
  depth: int = 6,
15
  num_heads: int = 2,
 
40
  global_pool="token",
41
  )
42
 
43
+ # Metrics for binary classification
44
  metrics = {
45
+ "acc": Accuracy(task="binary"),
46
+ "precision": Precision(task="binary"),
47
+ "recall": Recall(task="binary"),
48
+ "f1": F1Score(task="binary"),
 
 
 
 
49
  }
50
 
51
  # Initialize metrics for each stage
src/train.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from typing import List
5
+ import torch
6
+ import lightning as L
7
+ from lightning.pytorch.loggers import Logger, TensorBoardLogger, CSVLogger
8
+ from lightning.pytorch.callbacks import (
9
+ ModelCheckpoint,
10
+ EarlyStopping,
11
+ RichModelSummary,
12
+ RichProgressBar,
13
+ )
14
+ from dotenv import load_dotenv, find_dotenv
15
+ import hydra
16
+ from omegaconf import DictConfig, OmegaConf
17
+ from src.datamodules.catdog_datamodule import CatDogImageDataModule
18
+ from src.utils.logging_utils import setup_logger, task_wrapper
19
+ from loguru import logger
20
+ import rootutils
21
+
22
+ # Load environment variables
23
+ load_dotenv(find_dotenv(".env"))
24
+
25
+ # Setup root directory
26
+ try:
27
+ root = rootutils.setup_root(__file__, indicator=".project-root")
28
+ except Exception as e:
29
+ root = os.getcwd()
30
+
31
+ os.environ.setdefault("PROJECT_ROOT", str(root))
32
+
33
+
34
+ def initialize_callbacks(cfg: DictConfig) -> List[L.Callback]:
35
+ """Initialize callbacks based on configuration."""
36
+ callback_classes = {
37
+ "model_checkpoint": ModelCheckpoint,
38
+ "early_stopping": EarlyStopping,
39
+ "rich_model_summary": RichModelSummary,
40
+ "rich_progress_bar": RichProgressBar,
41
+ }
42
+ return [callback_classes[name](**params) for name, params in cfg.callbacks.items()]
43
+
44
+
45
+ def initialize_loggers(cfg: DictConfig) -> List[Logger]:
46
+ """Initialize loggers based on configuration."""
47
+ logger_classes = {
48
+ "tensorboard": TensorBoardLogger,
49
+ "csv": CSVLogger,
50
+ }
51
+ return [logger_classes[name](**params) for name, params in cfg.logger.items()]
52
+
53
+
54
+ def load_checkpoint_if_available(ckpt_path: str) -> str:
55
+ """Return the checkpoint path if available, else None."""
56
+ if ckpt_path and Path(ckpt_path).exists():
57
+ logger.info(f"Using checkpoint: {ckpt_path}")
58
+ return ckpt_path
59
+ logger.warning(f"Checkpoint not found at {ckpt_path}. Using current model weights.")
60
+ return None
61
+
62
+
63
+ def clear_checkpoint_directory(ckpt_dir: str):
64
+ """Clear checkpoint directory contents without removing the directory."""
65
+ ckpt_dir_path = Path(ckpt_dir)
66
+ if not ckpt_dir_path.exists():
67
+ logger.info(f"Creating checkpoint directory: {ckpt_dir}")
68
+ ckpt_dir_path.mkdir(parents=True, exist_ok=True)
69
+ else:
70
+ logger.info(f"Clearing checkpoint directory: {ckpt_dir}")
71
+ for item in ckpt_dir_path.iterdir():
72
+ try:
73
+ item.unlink() if item.is_file() else shutil.rmtree(item)
74
+ except Exception as e:
75
+ logger.error(f"Failed to delete {item}: {e}")
76
+
77
+
78
+ @task_wrapper
79
+ def train_module(
80
+ data_module: L.LightningDataModule, model: L.LightningModule, trainer: L.Trainer
81
+ ):
82
+ """Train the model and log metrics."""
83
+ logger.info("Starting training")
84
+ trainer.fit(model, data_module)
85
+ train_metrics = trainer.callback_metrics
86
+ train_acc = train_metrics.get("train_acc")
87
+ val_acc = train_metrics.get("val_acc")
88
+ logger.info(
89
+ f"Training completed. Metrics - train_acc: {train_acc}, val_acc: {val_acc}"
90
+ )
91
+ return train_metrics
92
+
93
+
94
+ @task_wrapper
95
+ def run_test_module(
96
+ cfg: DictConfig,
97
+ datamodule: L.LightningDataModule,
98
+ model: L.LightningModule,
99
+ trainer: L.Trainer,
100
+ ):
101
+ """Test the model using the best checkpoint or current model weights."""
102
+ logger.info("Starting testing")
103
+ datamodule.setup(stage="test")
104
+ test_metrics = trainer.test(
105
+ model, datamodule, ckpt_path=load_checkpoint_if_available(cfg.ckpt_path)
106
+ )
107
+ logger.info(f"Test metrics: {test_metrics}")
108
+ return test_metrics[0] if test_metrics else {}
109
+
110
+
111
+ @hydra.main(config_path="../configs", config_name="train", version_base="1.1")
112
+ def setup_run_trainer(cfg: DictConfig):
113
+ """Set up and run the Trainer for training and testing."""
114
+ # Display configuration
115
+ logger.info(f"Config:\n{OmegaConf.to_yaml(cfg)}")
116
+
117
+ # Initialize logger
118
+ log_path = Path(cfg.paths.log_dir) / (
119
+ "train.log" if cfg.task_name == "train" else "eval.log"
120
+ )
121
+ setup_logger(log_path)
122
+
123
+ # Display key paths
124
+ for path_name in [
125
+ "root_dir",
126
+ "data_dir",
127
+ "log_dir",
128
+ "ckpt_dir",
129
+ "artifact_dir",
130
+ "output_dir",
131
+ ]:
132
+ logger.info(
133
+ f"{path_name.replace('_', ' ').capitalize()}: {cfg.paths[path_name]}"
134
+ )
135
+
136
+ # Initialize DataModule and Model
137
+ logger.info(f"Instantiating datamodule <{cfg.data._target_}>")
138
+ datamodule: L.LightningDataModule = hydra.utils.instantiate(cfg.data)
139
+ logger.info(f"Instantiating model <{cfg.model._target_}>")
140
+ model: L.LightningModule = hydra.utils.instantiate(cfg.model)
141
+
142
+ # Check GPU availability and set seed for reproducibility
143
+ logger.info("GPU available" if torch.cuda.is_available() else "No GPU available")
144
+ L.seed_everything(cfg.seed, workers=True)
145
+
146
+ # Set up callbacks, loggers, and Trainer
147
+ callbacks = initialize_callbacks(cfg)
148
+ logger.info(f"Callbacks: {callbacks}")
149
+ loggers = initialize_loggers(cfg)
150
+ logger.info(f"Loggers: {loggers}")
151
+ trainer: L.Trainer = hydra.utils.instantiate(
152
+ cfg.trainer, callbacks=callbacks, logger=loggers
153
+ )
154
+
155
+ # Training phase
156
+ train_metrics = {}
157
+ if cfg.get("train"):
158
+ clear_checkpoint_directory(cfg.paths.ckpt_dir)
159
+ train_metrics = train_module(datamodule, model, trainer)
160
+ (Path(cfg.paths.ckpt_dir) / "train_done.flag").write_text(
161
+ "Training completed.\n"
162
+ )
163
+
164
+ # Testing phase
165
+ test_metrics = {}
166
+ if cfg.get("test"):
167
+ test_metrics = run_test_module(cfg, datamodule, model, trainer)
168
+
169
+ # Combine metrics and extract optimization metric
170
+ all_metrics = {**train_metrics, **test_metrics}
171
+ optimization_metric = all_metrics.get(cfg.get("optimization_metric"), 0.0)
172
+ (
173
+ logger.warning(
174
+ f"Optimization metric '{cfg.get('optimization_metric')}' not found. Defaulting to 0."
175
+ )
176
+ if optimization_metric == 0.0
177
+ else logger.info(f"Optimization metric: {optimization_metric}")
178
+ )
179
+
180
+ return optimization_metric
181
+
182
+
183
+ if __name__ == "__main__":
184
+ setup_run_trainer()