diff --git a/configs/computer/a100.yaml b/configs/computer/a100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b04a38b8241a9ebae448eac00b2f10c40200edd --- /dev/null +++ b/configs/computer/a100.yaml @@ -0,0 +1,8 @@ +devices: 1 +progress_bar_refresh_rate: 2 +num_workers: 8 +sync_batchnorm: False +accelerator: gpu +precision: 32 +strategy: auto +num_nodes: 1 diff --git a/configs/computer/cluster-node-a100.yaml b/configs/computer/cluster-node-a100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09742d7d495526cc0cdc60e7cc8c41b0383424f2 --- /dev/null +++ b/configs/computer/cluster-node-a100.yaml @@ -0,0 +1,8 @@ +devices: 8 +num_workers: 8 +progress_bar_refresh_rate: 2 +sync_batchnorm: True +accelerator: gpu +precision: 32 +strategy: ddp +num_nodes: 1 diff --git a/configs/computer/cluster-node-v100.yaml b/configs/computer/cluster-node-v100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b5f41cbff1d1de8c6c24cdd4ef54001fa5d6211 --- /dev/null +++ b/configs/computer/cluster-node-v100.yaml @@ -0,0 +1,8 @@ +devices: 4 +num_workers: 10 +progress_bar_refresh_rate: 2 +sync_batchnorm: True +accelerator: gpu +precision: 32 +strategy: ddp +num_nodes: 1 diff --git a/configs/computer/cpu.yaml b/configs/computer/cpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9460ab763f7f68612288dc50c9bd5934d5d14d4f --- /dev/null +++ b/configs/computer/cpu.yaml @@ -0,0 +1,8 @@ +devices: null +num_workers: 0 +progress_bar_refresh_rate: 2 +sync_batchnorm: False +accelerator: cpu +precision: 32 +strategy: auto +num_nodes: null diff --git a/configs/computer/v100.yaml b/configs/computer/v100.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8251dedb4026eaeb77286e3f750a1abe8862d261 --- /dev/null +++ b/configs/computer/v100.yaml @@ -0,0 +1,8 @@ +devices: 1 +num_workers: 10 +progress_bar_refresh_rate: 2 +sync_batchnorm: False +accelerator: gpu +precision: 32 +strategy: auto +num_nodes: 1 diff --git a/configs/config.yaml b/configs/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffc1405d94a9bb4e260820e40a85ea5f8c32155e --- /dev/null +++ b/configs/config.yaml @@ -0,0 +1,89 @@ +defaults: + - model: default + - computer: v100 + - dataset: osv5m + - _self_ + - exp: ??? 
+ +model: + val_metrics: + _target_: metrics.distance_based.HaversineMetrics + acc_radiuses: + - 1 + - 25 + - 200 + - 750 + - 2500 + acc_area: [] + aux_data: ${aux_data} + test_metrics: + _target_: metrics.distance_based.HaversineMetrics + acc_radiuses: + - 1 + - 25 + - 200 + - 750 + - 2500 + acc_area: ${areas} + aux_data: ${aux_data} + +datamodule: + _target_: data.datamodule.ImageDataModule + train_dataset: ${dataset.train_dataset} + val_dataset: ${dataset.val_dataset} + test_dataset: ${dataset.test_dataset} + global_batch_size: ${dataset.global_batch_size} + num_workers: ${computer.num_workers} + num_nodes: ${computer.num_nodes} + num_devices: ${computer.devices} + val_proportion: 0.1 + +trainer: + _target_: pytorch_lightning.Trainer + devices: ${computer.devices} + accelerator: ${computer.accelerator} + strategy: ${computer.strategy} + num_nodes: ${computer.num_nodes} + precision: ${computer.precision} + max_epochs: ${max_epochs} + +logger: + _target_: pytorch_lightning.loggers.WandbLogger + save_dir: ${root_dir} + name: ${experiment_name} + project: plonk + log_model: False + offline: False + entity: imaginelab + +checkpoints: + _target_: pytorch_lightning.callbacks.ModelCheckpoint + dirpath: ${root_dir}/checkpoints/${experiment_name} + filename: 'epoch_{epoch}' + monitor: val/loss + save_last: True + save_top_k: 0 + every_n_epochs: 1 + +progress_bar: + _target_: pytorch_lightning.callbacks.TQDMProgressBar + refresh_rate: ${computer.progress_bar_refresh_rate} + +aux_data: [] +max_epochs: 100 +data_dir: ${root_dir}/datasets +root_dir: ${hydra:runtime.cwd} +experiment_name: ${dataset.name}__${model.name} +mode: train # change that to eval to do the testing +num_classes: 0 +areas: ['country', 'region', 'sub-region', 'city'] +class_name: null +streetclip: False +blur: False +text_tuning: False + +hydra: + run: + dir: outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}/${experiment_name} + job: + chdir: true diff --git a/configs/dataset/baselines/im2gps.yaml b/configs/dataset/baselines/im2gps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bd956b848db55eb4420328451dc1a1e208e1d44 --- /dev/null +++ b/configs/dataset/baselines/im2gps.yaml @@ -0,0 +1,16 @@ +dataset: + name: im2gps + global_batch_size: 512 + test_dataset: + _partial_: true + _target_: data.data.Baseline + path: ${data_dir}/baselines/im2gps + which: 'im2gps' + transforms: ${dataset.test_transform} +datamodule: + _target_: data.datamodule.BaselineDataModule + test_dataset: ${dataset.test_dataset} + global_batch_size: ${dataset.global_batch_size} + num_workers: ${computer.num_workers} + num_nodes: ${computer.num_nodes} + num_devices: ${computer.devices} \ No newline at end of file diff --git a/configs/dataset/baselines/im2gps3k.yaml b/configs/dataset/baselines/im2gps3k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a5bc31cd92f7d5b8e654e2029ab85fd3704765b --- /dev/null +++ b/configs/dataset/baselines/im2gps3k.yaml @@ -0,0 +1,16 @@ +dataset: + name: im2gps3k + global_batch_size: 512 + test_dataset: + _partial_: true + _target_: data.data.Baseline + path: ${data_dir}/baselines/im2gps3k + which: 'im2gps3k' + transforms: ${dataset.test_transform} +datamodule: + _target_: data.datamodule.BaselineDataModule + test_dataset: ${dataset.test_dataset} + global_batch_size: ${dataset.global_batch_size} + num_workers: ${computer.num_workers} + num_nodes: ${computer.num_nodes} + num_devices: ${computer.devices} \ No newline at end of file diff --git a/configs/dataset/baselines/yfcc4k.yaml 
b/configs/dataset/baselines/yfcc4k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65ef8274be9c538f9d871e50e4fda473bd98bb35 --- /dev/null +++ b/configs/dataset/baselines/yfcc4k.yaml @@ -0,0 +1,16 @@ +dataset: + name: yfcc4k + global_batch_size: 512 + test_dataset: + _partial_: true + _target_: data.data.Baseline + path: ${data_dir}/baselines/yfcc4k + which: 'yfcc4k' + transforms: ${dataset.test_transform} +datamodule: + _target_: data.datamodule.BaselineDataModule + test_dataset: ${dataset.test_dataset} + global_batch_size: ${dataset.global_batch_size} + num_workers: ${computer.num_workers} + num_nodes: ${computer.num_nodes} + num_devices: ${computer.devices} \ No newline at end of file diff --git a/configs/dataset/osv5m.yaml b/configs/dataset/osv5m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90d2626a83be312502fd8bf82f83357c405b0181 --- /dev/null +++ b/configs/dataset/osv5m.yaml @@ -0,0 +1,46 @@ +defaults: + - train_transform: fast_clip + - test_transform: fast_clip + - _self_ + +name: osv5m +global_batch_size: 256 + +train_dataset: + _partial_: true + _target_: data.data.osv5m + path: ${data_dir}/osv5m/ + split: train + class_name: ${class_name} + transforms: ${dataset.train_transform} + aux_data: ${aux_data} + is_baseline: ${is_baseline} + areas: ${areas} + streetclip: ${streetclip} + blur: ${blur} + +val_dataset: + _partial_: true + _target_: data.data.osv5m + path: ${data_dir}/osv5m/ + split: val + class_name: ${class_name} + transforms: ${dataset.test_transform} + aux_data: ${aux_data} + is_baseline: ${is_baseline} + areas: ${areas} + streetclip: ${streetclip} + blur: ${blur} + +test_dataset: + _partial_: true + _target_: data.data.osv5m + path: ${data_dir}/osv5m/ + split: test + class_name: ${class_name} + transforms: ${dataset.test_transform} + aux_data: ${aux_data} + is_baseline: ${is_baseline} + areas: ${areas} + streetclip: ${streetclip} + blur: ${blur} diff --git a/configs/dataset/osv5m_contrastive.yaml b/configs/dataset/osv5m_contrastive.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78d154f823ce670328972e9364d4163a8f16cd97 --- /dev/null +++ b/configs/dataset/osv5m_contrastive.yaml @@ -0,0 +1,34 @@ +defaults: + - train_transform: fast_clip + - test_transform: fast_clip + - _self_ + +name: osv5m +global_batch_size: 256 + +train_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: train + class_name: ${class_name} + transforms: ${dataset.train_transform} + blur: ${blur} + +val_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: val + class_name: ${class_name} + transforms: ${dataset.test_transform} + blur: ${blur} + +test_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: test + class_name: ${class_name} + transforms: ${dataset.test_transform} + blur: ${blur} diff --git a/configs/dataset/osv5m_contrastive_best.yaml b/configs/dataset/osv5m_contrastive_best.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb9fc4bae3b46442844f0032ed41261de3dfc8e2 --- /dev/null +++ b/configs/dataset/osv5m_contrastive_best.yaml @@ -0,0 +1,37 @@ +defaults: + - train_transform: fast_clip + - test_transform: fast_clip + - _self_ + +name: osv5m +global_batch_size: 256 + +train_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: train + class_name: ${class_name} + transforms: ${dataset.train_transform} + 
class_name2: 'unique_region' + blur: ${blur} + +val_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: val + class_name: ${class_name} + transforms: ${dataset.test_transform} + class_name2: 'unique_region' + blur: ${blur} + +test_dataset: + _partial_: true + _target_: data.data.Contrastiveosv5m + path: ${data_dir}/osv5m/ + split: test + class_name: ${class_name} + transforms: ${dataset.test_transform} + class_name2: 'unique_region' + blur: ${blur} \ No newline at end of file diff --git a/configs/dataset/osv5m_text_contrastive.yaml b/configs/dataset/osv5m_text_contrastive.yaml new file mode 100644 index 0000000000000000000000000000000000000000..407f3fd6d8f6f7b3076b753b95304d0dba95953d --- /dev/null +++ b/configs/dataset/osv5m_text_contrastive.yaml @@ -0,0 +1,34 @@ +defaults: + - train_transform: fast_clip + - test_transform: fast_clip + - _self_ + +name: osv5m +global_batch_size: 256 + +train_dataset: + _partial_: true + _target_: data.data.TextContrastiveosv5m + path: ${data_dir}/osv5m/ + split: train + class_name: ${class_name} + transforms: ${dataset.train_transform} + blur: ${blur} + +val_dataset: + _partial_: true + _target_: data.data.TextContrastiveosv5m + path: ${data_dir}/osv5m/ + split: val + class_name: ${class_name} + transforms: ${dataset.test_transform} + blur: ${blur} + +test_dataset: + _partial_: true + _target_: data.data.TextContrastiveosv5m + path: ${data_dir}/osv5m/ + split: test + class_name: ${class_name} + transforms: ${dataset.test_transform} + blur: ${blur} diff --git a/configs/dataset/test_transform/center_crop.yaml b/configs/dataset/test_transform/center_crop.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee4cbad1b36738048774feedb11a83a616fd222c --- /dev/null +++ b/configs/dataset/test_transform/center_crop.yaml @@ -0,0 +1,12 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.ToTensor + - _target_: utils.image_processing.CenterCrop + ratio: "1:1" + - _target_: torchvision.transforms.Resize + size: ${dataset.img_resolution} + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.Normalize + mean: 0.5 + std: 0.5 diff --git a/configs/dataset/test_transform/clip.yaml b/configs/dataset/test_transform/clip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b95064021c7b01515892be99de05a5da6fbbb10 --- /dev/null +++ b/configs/dataset/test_transform/clip.yaml @@ -0,0 +1,2 @@ +_target_: data.transforms.ClipTransform +split: val diff --git a/configs/dataset/test_transform/fast_clip.yaml b/configs/dataset/test_transform/fast_clip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c59fc7d9b1e9ab7a9ac93a0fb95f33c5f009b05 --- /dev/null +++ b/configs/dataset/test_transform/fast_clip.yaml @@ -0,0 +1,12 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.CenterCrop + size: 224 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: [0.48145466, 0.4578275, 0.40821073] + std: [0.26862954, 0.26130258, 0.27577711] diff --git a/configs/dataset/test_transform/fast_resnet.yaml b/configs/dataset/test_transform/fast_resnet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c4e0c6eddd05ba839d8c44aaa85ac9e62da2ba7 --- /dev/null +++ b/configs/dataset/test_transform/fast_resnet.yaml @@ -0,0 +1,12 @@ +_target_: 
torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.CenterCrop + size: 224 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] \ No newline at end of file diff --git a/configs/dataset/test_transform/none.yaml b/configs/dataset/test_transform/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35a7d36b94c3f9c42e595bbcd7742190ec42058d --- /dev/null +++ b/configs/dataset/test_transform/none.yaml @@ -0,0 +1,6 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: 0.5 + std: 0.5 diff --git a/configs/dataset/train_transform/augmentation.yaml b/configs/dataset/train_transform/augmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a44879cd406d15cbc0b31070f00d062290457f9 --- /dev/null +++ b/configs/dataset/train_transform/augmentation.yaml @@ -0,0 +1,85 @@ +_target_: data.augmentation.ImageAugmentation +names: "standard_augmentation,geometric_augmentation,clip_transform" + +# always apply clip_transform at the end +clip_transform: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.CenterCrop + size: 224 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: [0.48145466, 0.4578275, 0.40821073] + std: [0.26862954, 0.26130258, 0.27577711] + +standard_augmentation: + _target_: data.augmentation.StandardAugmentation + # by default, we apply all augmentation methods + names: "brightness,contrast,sharpness,color,blur,gaussian_noise" + + # random PIL brightness + brightness: + _target_: data.augmentation.PillowBrightness + p: 0.2 + factor_interval: [0.5, 1.5] + + # random PIL contrast + contrast: + _target_: data.augmentation.PillowContrast + p: 0.2 + factor_interval: [0.3, 3] + + # random PIL sharpness + sharpness: + _target_: data.augmentation.PillowSharpness + p: 0.2 + factor_interval: [0.5, 30.0] + + # random PIL color + color: + _target_: data.augmentation.PillowColor + p: 0.2 + factor_interval: [0.0, 2.0] + + # random PIL blur + blur: + _target_: data.augmentation.PillowBlur + p: 0.2 + factor_interval: [1, 2] + + # random numpy gaussian noise + gaussian_noise: + _target_: data.augmentation.NumpyGaussianNoise + p: 0.2 + factor_interval: [0.1, 0.04] + +geometric_augmentation: + _target_: data.augmentation.GeometricAugmentation + # by default, we apply all augmentation methods + names: "random_rotation,random_resized_crop,random_horizontal_flip" + + # random rotation + random_rotation: + _target_: torchvision.transforms.RandomRotation + degrees: [-15, 15] + + # random crop + random_resized_crop: + _target_: torchvision.transforms.RandomResizedCrop + scale: [0.5, 1.0] + ratio: [0.9, 1.1] + size: 224 + + # random horizontal flip + random_horizontal_flip: + _target_: torchvision.transforms.RandomHorizontalFlip + p: 0.5 + + # random vertical flip + random_vertical_flip: + _target_: torchvision.transforms.RandomVerticalFlip + p: 0.5 diff --git a/configs/dataset/train_transform/center_crop.yaml b/configs/dataset/train_transform/center_crop.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f29b4f6055d2df6491a90206318f8a0bb4b836b ---
/dev/null +++ b/configs/dataset/train_transform/center_crop.yaml @@ -0,0 +1,14 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.ToTensor + - _target_: utils.image_processing.CenterCrop + ratio: "1:1" + - _target_: torchvision.transforms.Resize + size: ${dataset.img_resolution} + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.RandomHorizontalFlip + p: 0.5 + - _target_: torchvision.transforms.Normalize + mean: 0.5 + std: 0.5 diff --git a/configs/dataset/train_transform/clip.yaml b/configs/dataset/train_transform/clip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b95064021c7b01515892be99de05a5da6fbbb10 --- /dev/null +++ b/configs/dataset/train_transform/clip.yaml @@ -0,0 +1,2 @@ +_target_: data.transforms.ClipTransform +split: val diff --git a/configs/dataset/train_transform/fast_clip.yaml b/configs/dataset/train_transform/fast_clip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c59fc7d9b1e9ab7a9ac93a0fb95f33c5f009b05 --- /dev/null +++ b/configs/dataset/train_transform/fast_clip.yaml @@ -0,0 +1,12 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.CenterCrop + size: 224 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: [0.48145466, 0.4578275, 0.40821073] + std: [0.26862954, 0.26130258, 0.27577711] diff --git a/configs/dataset/train_transform/fast_resnet.yaml b/configs/dataset/train_transform/fast_resnet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c4e0c6eddd05ba839d8c44aaa85ac9e62da2ba7 --- /dev/null +++ b/configs/dataset/train_transform/fast_resnet.yaml @@ -0,0 +1,12 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.CenterCrop + size: 224 + - _target_: torchvision.transforms.ToTensor + - _target_: torchvision.transforms.Normalize + mean: [0.485 ,0.456 ,0.406] + std: [0.229, 0.224, 0.225] \ No newline at end of file diff --git a/configs/dataset/train_transform/none.yaml b/configs/dataset/train_transform/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..235b00288f91bf207e8b21feae6a93dba7cd9120 --- /dev/null +++ b/configs/dataset/train_transform/none.yaml @@ -0,0 +1,7 @@ +_target_: torchvision.transforms.Compose +transforms: + - _target_: torchvision.transforms.Resize + size: 224 + interpolation: 3 + antialias: true + - _target_: torchvision.transforms.ToTensor diff --git a/configs/exp/DinoV2.yaml b/configs/exp/DinoV2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc244acb79f20a52a05eb8b9684c6c038d932bfa --- /dev/null +++ b/configs/exp/DinoV2.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: dinov2_vitl14 + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/ResNet.yaml b/configs/exp/ResNet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fae93795a757e831ad75809950080698b5085c76 --- /dev/null +++ b/configs/exp/ResNet.yaml @@ -0,0 +1,21 @@ +# @package _global_ + +defaults: + - override /model: regression + - 
override /dataset/test_transform: fast_resnet + - override /dataset/train_transform: fast_resnet + - override /model.network.mid: mlp_resnet + - override /model/network/backbone: ResNet50 + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/base_model.yaml b/configs/exp/base_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c526707801a66df05fa11ceec9bd5347cf50406 --- /dev/null +++ b/configs/exp/base_model.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: openclip_B_32 + - _self_ + +model: + name: base_model + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/best_model.yaml b/configs/exp/best_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a67b491f3e47020257a8eea6ca6367828dc26d2d --- /dev/null +++ b/configs/exp/best_model.yaml @@ -0,0 +1,25 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive_best + - override /model: hybrid + - override /model/network: best_backbone + - override /model/network/backbone: clip_L_14_DataComp + - override /model/network/mid: mlp_hybrid + - override /model/loss: best_model + - _self_ + +class_name: 'quadtree_10_1000' +is_baseline: false +max_epochs: 30 + +model: + name: best_model + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/classification_area.yaml b/configs/exp/classification_area.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad4fa7f2fe4f68cedf40bbc26064238c861f1ac0 --- /dev/null +++ b/configs/exp/classification_area.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: 'area' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_cell.yaml b/configs/exp/classification_cell.yaml new file mode 100644 index 0000000000000000000000000000000000000000..060116385fbde15ada31b94612b9200ab6abeb2b --- /dev/null +++ b/configs/exp/classification_cell.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: quadtree_10_1000 +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_cell_hier.yaml b/configs/exp/classification_cell_hier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60d9b4d08b48fc1dcbec89149f3ff2726b31dc0c --- /dev/null +++ b/configs/exp/classification_cell_hier.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - override /model/loss: cls_hier_quad + - _self_ + +class_name: quadtree_10_1000 +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git 
a/configs/exp/classification_city.yaml b/configs/exp/classification_city.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb4ffa233649b95e1b32ec63cd2ee36f5665c6ed --- /dev/null +++ b/configs/exp/classification_city.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: 'city' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_city_hier.yaml b/configs/exp/classification_city_hier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6239da3a6759956df37cb2839a8ba8b9661adfc4 --- /dev/null +++ b/configs/exp/classification_city_hier.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - override /model/loss: cls_hier + - _self_ + +class_name: 'city' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_country.yaml b/configs/exp/classification_country.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5eba63e8fbcb6301b42929206d34570aa0659058 --- /dev/null +++ b/configs/exp/classification_country.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: 'country' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_region copy.yaml b/configs/exp/classification_region copy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b47d706b4c757d6f552221909e7fa135b9dfed96 --- /dev/null +++ b/configs/exp/classification_region copy.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: 'region' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/classification_region.yaml b/configs/exp/classification_region.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b47d706b4c757d6f552221909e7fa135b9dfed96 --- /dev/null +++ b/configs/exp/classification_region.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: classification + - override /model/network/backbone: openclip_B_32 + - _self_ + +class_name: 'region' +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 15 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/clip_L_14_DataComp.yaml b/configs/exp/clip_L_14_DataComp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20c0d9805e90352b1611a75cb5ee2c4f59fe6d3b --- /dev/null +++ b/configs/exp/clip_L_14_DataComp.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: clip_L_14_DataComp + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/clip_L_14_Laion.yaml 
b/configs/exp/clip_L_14_Laion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50015a29e6646b7d1067d76c0eacd35b9050083d --- /dev/null +++ b/configs/exp/clip_L_14_Laion.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: openclip_L_14 + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/clip_L_14_OpenAI.yaml b/configs/exp/clip_L_14_OpenAI.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b71c7caab5f2c92b23b9eb4f5bed061ec3b74f2d --- /dev/null +++ b/configs/exp/clip_L_14_OpenAI.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: clip_L_14 + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/clip_bigG_14_Laion.yaml b/configs/exp/clip_bigG_14_Laion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..340cd07e004084e8fb577d97bfb8326367804dd9 --- /dev/null +++ b/configs/exp/clip_bigG_14_Laion.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: openclip_bigG_14 + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/contrastive_area.yaml b/configs/exp/contrastive_area.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92f34df2ad3af9c64827337993be9163c137e994 --- /dev/null +++ b/configs/exp/contrastive_area.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive + - override /model: regression + - override /model/network: contrastive_unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - override /model/loss: contrastive + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +class_name: area +is_baseline: false +max_epochs: 30 diff --git a/configs/exp/contrastive_cell.yaml b/configs/exp/contrastive_cell.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c5be9e29a2227b55839af173c2b7761a12488fc --- /dev/null +++ b/configs/exp/contrastive_cell.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive + - override /model: regression + - override /model/network: contrastive_unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - override /model/loss: contrastive + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +class_name: quadtree_10_1000 +is_baseline: false +max_epochs: 30 diff --git a/configs/exp/contrastive_city.yaml b/configs/exp/contrastive_city.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c9524b16486736eef6a4b7a606d8d74f783e900 --- /dev/null +++ b/configs/exp/contrastive_city.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive + - override /model: regression + - override /model/network: contrastive_unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - override /model/loss: contrastive + - _self_ + +model: + 
optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +class_name: city +is_baseline: false +max_epochs: 30 diff --git a/configs/exp/contrastive_country.yaml b/configs/exp/contrastive_country.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f973f805d406c249ff25f1d3e14525e60eaf891 --- /dev/null +++ b/configs/exp/contrastive_country.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive + - override /model: regression + - override /model/network: contrastive_unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - override /model/loss: contrastive + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +class_name: country +is_baseline: false +max_epochs: 30 diff --git a/configs/exp/contrastive_region.yaml b/configs/exp/contrastive_region.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e469d19d359d33436f8f27d131c36fc574a83c32 --- /dev/null +++ b/configs/exp/contrastive_region.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive + - override /model: regression + - override /model/network: contrastive_unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - override /model/loss: contrastive + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +class_name: region +is_baseline: false +max_epochs: 30 diff --git a/configs/exp/contrastive_text.yaml b/configs/exp/contrastive_text.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f881237de30a6a5ded481ea2e25b14924955a759 --- /dev/null +++ b/configs/exp/contrastive_text.yaml @@ -0,0 +1,22 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_text_contrastive + - override /model: text_tuning + - override /model/network/backbone: openclip_B_32 + - _self_ + +model: + network: + backbone: + instance: + _target_: models.networks.backbones.CLIPText + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +class_name: city +text_tuning: True +max_epochs: 30 diff --git a/configs/exp/eval_best_model.yaml b/configs/exp/eval_best_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5d0ac9c8234c42858b2f9c20b224e0b6b98ff99 --- /dev/null +++ b/configs/exp/eval_best_model.yaml @@ -0,0 +1,29 @@ +# @package _global_ + +defaults: + - override /dataset: osv5m_contrastive_best + - override /model: hybrid + - override /model/network: best_backbone + - override /model/network/backbone: clip_L_14_DataComp + - override /model/network/mid: mlp_hybrid + - _self_ + +class_name: 'quadtree_10_1000' +is_baseline: false +max_epochs: 30 +mode: 'eval' + +model: + name: best_model + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + network: + head: + instance: + quadtree_path: ${root_dir}/quadtree_10_1000.csv + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/fine_tuning.yaml b/configs/exp/fine_tuning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9582b23461c899929ee1209c6f5e671c6e90c600 --- /dev/null +++ b/configs/exp/fine_tuning.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network: unfrozen_backbone + - override /model/network/backbone: openclip_B_32 + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +is_baseline: false +max_epochs: 30 + +dataset: 
+ global_batch_size: 2048 diff --git a/configs/exp/hybrid.yaml b/configs/exp/hybrid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a646b76a616b38bd6919311bbf005f7b7de5a172 --- /dev/null +++ b/configs/exp/hybrid.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /model: hybrid + - override /model/network/backbone: openclip_B_32 + - override /model/network/mid: mlp_hybrid + - _self_ + +class_name: 'quadtree_10_1000' +is_baseline: false +max_epochs: 30 + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/last_block.yaml b/configs/exp/last_block.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2cbcfab4cbd0db7689d7849644557b3c8187f1d --- /dev/null +++ b/configs/exp/last_block.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network: last_block_backbone + - override /model/network/backbone: openclip_B_32 + - _self_ + +model: + optimizer: + optim: + lr: 2e-4 + weight_decay: 0.0001 + backbone_lr: 2e-5 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/lora-32.yaml b/configs/exp/lora-32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9c651eccfcfc2d54bb7f972c5b3e749a0483dcb --- /dev/null +++ b/configs/exp/lora-32.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network: lora_backbone + - override /model/network/backbone: openclip_B_32 + - _self_ + +is_baseline: false + +lora_r: 32 +lora_alpha: 256 +lora_dropout: 0.1 +lora_bias: lora_only +max_epochs: 30 + +dataset: + global_batch_size: 2048 diff --git a/configs/exp/metaclip.yaml b/configs/exp/metaclip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c2504e9d496d12b3361a5aa94c9611c8b100414 --- /dev/null +++ b/configs/exp/metaclip.yaml @@ -0,0 +1,18 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: metaclip + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/random.yaml b/configs/exp/random.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cdf7a50d0ab1212491766736f1f8a39d02a5d2e8 --- /dev/null +++ b/configs/exp/random.yaml @@ -0,0 +1,10 @@ +# @package _global_ + +defaults: + - override /model: random + - _self_ + +class_name: 'country' +is_baseline: false +max_epochs: 1 +mode: eval diff --git a/configs/exp/reg_sincos.yaml b/configs/exp/reg_sincos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de3f1e19ebdc1c6079a6c45d590af307a4626cc1 --- /dev/null +++ b/configs/exp/reg_sincos.yaml @@ -0,0 +1,19 @@ +# @package _global_ + +defaults: + - override /model: regression + - override /model/network/backbone: openclip_B_32 + - override /model/network/head: regression_angle + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/exp/streetclip.yaml b/configs/exp/streetclip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b47889d5a887478526d7ad026a95d4b153f5f07 --- /dev/null +++ b/configs/exp/streetclip.yaml @@ -0,0 +1,19 @@ +# @package 
_global_ + +defaults: + - override /model: regression + - override /model/network/backbone: streetclip + - _self_ + +model: + optimizer: + optim: + lr: 0.0002 + weight_decay: 0.0001 + +is_baseline: false +max_epochs: 30 +streetclip: True + +dataset: + global_batch_size: 2048 \ No newline at end of file diff --git a/configs/model/baselines.yaml b/configs/model/baselines.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af46e96c9564a4ec554e2b57be192e327a08a643 --- /dev/null +++ b/configs/model/baselines.yaml @@ -0,0 +1,10 @@ +defaults: + - optimizer: none + - lr_scheduler: none + - network: baselines + - loss: mix + - _self_ + +name: Baseline +aux_data: ${aux_data} +text_tuning: ${text_tuning} diff --git a/configs/model/classification.yaml b/configs/model/classification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1646ea39a79bccb878a70c6e62cf9cc5543e9fcc --- /dev/null +++ b/configs/model/classification.yaml @@ -0,0 +1,11 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: frozen_backbone + - loss: cls + - override network/head: classification + - _self_ + +name: Classification +aux_data: ${aux_data} +text_tuning: ${text_tuning} diff --git a/configs/model/hybrid.yaml b/configs/model/hybrid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65401813d48a739db759610bac8e00c7f75f1794 --- /dev/null +++ b/configs/model/hybrid.yaml @@ -0,0 +1,10 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: hybrid_frozen_backbone + - loss: hybrid + - override network/head: hybrid + - _self_ + +name: Hybrid +text_tuning: ${text_tuning} diff --git a/configs/model/hybrid_sharedreg.yaml b/configs/model/hybrid_sharedreg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc17ba5c4e6ad5f1ed2198996b3c9abc146606c4 --- /dev/null +++ b/configs/model/hybrid_sharedreg.yaml @@ -0,0 +1,10 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: hybrid_frozen_backbone + - loss: hybrid + - override network/head: hybrid_sharedreg + - _self_ + +name: SharedHybrid +text_tuning: ${text_tuning} diff --git a/configs/model/loss/aux_loss.yaml b/configs/model/loss/aux_loss.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04f7a4539c641db26bd9a21d1cc5f9396b22debe --- /dev/null +++ b/configs/model/loss/aux_loss.yaml @@ -0,0 +1,11 @@ +_target_: models.losses.Losses +mix: { + haversine : 0.0, + L1 : 1.0, + land_cover: 1.0, + drive_side: 1.0, + climate: 1.0, + soil: 1.0, + dist_sea: 1.0, +} +aux_data: ${aux_data} diff --git a/configs/model/loss/best_model.yaml b/configs/model/loss/best_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02ae7172aaf89f57f0e8aaf766557b2f129c9c18 --- /dev/null +++ b/configs/model/loss/best_model.yaml @@ -0,0 +1,8 @@ +_target_: models.losses.Losses +mix: { + region_mil : 1.0, + hier_quad : 1.0, + l2_hybrid : 1.0, +} +path: ${data_dir} +num_devices: ${computer.devices} diff --git a/configs/model/loss/cls.yaml b/configs/model/loss/cls.yaml new file mode 100644 index 0000000000000000000000000000000000000000..552d89f149cc38f63a17ff1a515ee43ff9f46ecd --- /dev/null +++ b/configs/model/loss/cls.yaml @@ -0,0 +1,4 @@ +_target_: models.losses.Losses +mix: { + CrossEntropy : 1.0, +} diff --git a/configs/model/loss/cls_hier.yaml b/configs/model/loss/cls_hier.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca9b5f0f2212bfa79b23891cd1b8504455668cce --- /dev/null +++ b/configs/model/loss/cls_hier.yaml @@ 
-0,0 +1,5 @@ +_target_: models.losses.Losses +mix: { + hierarchical : 1.0, +} +path: ${data_dir} diff --git a/configs/model/loss/cls_hier_quad.yaml b/configs/model/loss/cls_hier_quad.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b6e4678f20bcaa4e9671698451c264189a6f36c --- /dev/null +++ b/configs/model/loss/cls_hier_quad.yaml @@ -0,0 +1,5 @@ +_target_: models.losses.Losses +mix: { + hier_quad : 1.0, +} +path: ${data_dir} diff --git a/configs/model/loss/contrastive.yaml b/configs/model/loss/contrastive.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a09d89956af3cf4ff240c6959d2034308c14e61 --- /dev/null +++ b/configs/model/loss/contrastive.yaml @@ -0,0 +1,7 @@ +_target_: models.losses.Losses +mix: { + MIL-NCE : 1.0, + #infoNCE : 1.0, + L1 : 1.0, +} +num_devices: ${computer.devices} diff --git a/configs/model/loss/contrastive_only.yaml b/configs/model/loss/contrastive_only.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4d973e32fa4106b6c9cf78a68ce8268e5dfb7c9 --- /dev/null +++ b/configs/model/loss/contrastive_only.yaml @@ -0,0 +1,6 @@ +_target_: models.losses.Losses +mix: { + MIL-NCE : 1.0, + #L1 : 1.0, +} +num_devices: ${computer.devices} diff --git a/configs/model/loss/geoguessr.yaml b/configs/model/loss/geoguessr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f40223f6cbae7047f8dc9f571f08a9d3b119e25 --- /dev/null +++ b/configs/model/loss/geoguessr.yaml @@ -0,0 +1,4 @@ +_target_: models.losses.Losses +mix: { + geoguessr : 1.0, +} diff --git a/configs/model/loss/hybrid.yaml b/configs/model/loss/hybrid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..189584e8943cbeb2d2f7e957cd15bec13991a864 --- /dev/null +++ b/configs/model/loss/hybrid.yaml @@ -0,0 +1,6 @@ +_target_: models.losses.Losses +mix: { + crossentropy: 1.0, + #l1 : 1.0, + l2_hybrid : 1.0, +} diff --git a/configs/model/loss/mix.yaml b/configs/model/loss/mix.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e52a9dda0238b95e134927c78a4c40490dc8d87 --- /dev/null +++ b/configs/model/loss/mix.yaml @@ -0,0 +1,5 @@ +_target_: models.losses.Losses +mix: { + crossentropy: 1.0, + l1 : 1.0, +} diff --git a/configs/model/loss/reg.yaml b/configs/model/loss/reg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9acdfbded1d94cbea20bf7b51d13ae7500a302df --- /dev/null +++ b/configs/model/loss/reg.yaml @@ -0,0 +1,5 @@ +_target_: models.losses.Losses +mix: { + haversine : 0.0, + L1 : 1.0, +} diff --git a/configs/model/loss/text_tuning.yaml b/configs/model/loss/text_tuning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..259d09ba92574a969f494b0787dd3460eb51ee9e --- /dev/null +++ b/configs/model/loss/text_tuning.yaml @@ -0,0 +1,6 @@ +_target_: models.losses.Losses +mix: { + Text-NCE : 1.0, + L1 : 1.0, +} +num_devices: ${computer.devices} diff --git a/configs/model/lr_scheduler/none.yaml b/configs/model/lr_scheduler/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3332f01b219d660201225466878cd314302e263 --- /dev/null +++ b/configs/model/lr_scheduler/none.yaml @@ -0,0 +1,3 @@ +_partial_: true +_target_: utils.lr_scheduler.WarmupLR +warmup_steps: 0 diff --git a/configs/model/lr_scheduler/warmup.yaml b/configs/model/lr_scheduler/warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c5fb91b182270b9ad3947aac405f413af092fba --- /dev/null +++ b/configs/model/lr_scheduler/warmup.yaml @@ -0,0 +1,3 @@ 
+_partial_: true +_target_: utils.lr_scheduler.WarmupLR +warmup_steps: 20000 diff --git a/configs/model/lr_scheduler/warmup_cosine_decay.yaml b/configs/model/lr_scheduler/warmup_cosine_decay.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17658f3c6db1c3197be06e79768341e223e549fd --- /dev/null +++ b/configs/model/lr_scheduler/warmup_cosine_decay.yaml @@ -0,0 +1,4 @@ +_partial_: true +_target_: utils.lr_scheduler.WarmupCosineDecayLR +warmup_steps: 20000 +total_steps: ${trainer.max_steps} diff --git a/configs/model/multi.yaml b/configs/model/multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b17304cb927da46f78e498702d913bb24e560917 --- /dev/null +++ b/configs/model/multi.yaml @@ -0,0 +1,9 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: multi_task + - loss: aux_loss + - _self_ + +name: Multi_task +text_tuning: ${text_tuning} diff --git a/configs/model/network/backbone/ResNet50.yaml b/configs/model/network/backbone/ResNet50.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84d03e71ef40798dcf07da2dc97268749dd85e30 --- /dev/null +++ b/configs/model/network/backbone/ResNet50.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.ResNet + path: microsoft/resnet-50 + +output_dim: 2048 \ No newline at end of file diff --git a/configs/model/network/backbone/clip_B_32.yaml b/configs/model/network/backbone/clip_B_32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e9dad30a064a3ca219e85c39e73430b91d70ea4 --- /dev/null +++ b/configs/model/network/backbone/clip_B_32.yaml @@ -0,0 +1,6 @@ +instance: + _target_: models.networks.backbones.CLIP + path: openai/clip-vit-base-patch32 + + +output_dim: 768 diff --git a/configs/model/network/backbone/clip_L_14.yaml b/configs/model/network/backbone/clip_L_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4fb45ebc159a7c62fb3d1fcfc858e677bfec92df --- /dev/null +++ b/configs/model/network/backbone/clip_L_14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: openai/clip-vit-large-patch14 + +output_dim: 1024 diff --git a/configs/model/network/backbone/clip_L_14_DataComp.yaml b/configs/model/network/backbone/clip_L_14_DataComp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9785bd6540a55282072028944f67c15534aa49c2 --- /dev/null +++ b/configs/model/network/backbone/clip_L_14_DataComp.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K + +output_dim: 1024 diff --git a/configs/model/network/backbone/dinov2_vitb14.yaml b/configs/model/network/backbone/dinov2_vitb14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6e948cacc56de16738a9f76ac523c8a446f7231 --- /dev/null +++ b/configs/model/network/backbone/dinov2_vitb14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.DINOv2 + tag: dinov2_vitb14 + +output_dim: 768 diff --git a/configs/model/network/backbone/dinov2_vitg14.yaml b/configs/model/network/backbone/dinov2_vitg14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2bec9f74c484025b94600ff15da5244698f6aba --- /dev/null +++ b/configs/model/network/backbone/dinov2_vitg14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.DINOv2 + tag: dinov2_vitg14 + +output_dim: 1536 diff --git a/configs/model/network/backbone/dinov2_vitl14.yaml b/configs/model/network/backbone/dinov2_vitl14.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..8a562f1cc1f0937970697235221d15c51f33270c --- /dev/null +++ b/configs/model/network/backbone/dinov2_vitl14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.DINOv2 + tag: dinov2_vitl14 + +output_dim: 1024 diff --git a/configs/model/network/backbone/dinov2_vits14.yaml b/configs/model/network/backbone/dinov2_vits14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b2d2d4525826dc0986f5d403569398097b0bb0f --- /dev/null +++ b/configs/model/network/backbone/dinov2_vits14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.DINOv2 + tag: dinov2_vits14 + +output_dim: 384 diff --git a/configs/model/network/backbone/identity.yaml b/configs/model/network/backbone/identity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad8a48eb52d85f0d680e0f04bfbe8feb65bfd8f7 --- /dev/null +++ b/configs/model/network/backbone/identity.yaml @@ -0,0 +1,2 @@ +instance: + _target_: torch.nn.Identity diff --git a/configs/model/network/backbone/metaclip.yaml b/configs/model/network/backbone/metaclip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb1ded120198c9038c8df4d98823d4402cdefecf --- /dev/null +++ b/configs/model/network/backbone/metaclip.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: facebook/metaclip-l14-fullcc2.5b + +output_dim: 1024 diff --git a/configs/model/network/backbone/openclip_B_32.yaml b/configs/model/network/backbone/openclip_B_32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9e2c7a15785f11c68653e7c5cce864f056679e1 --- /dev/null +++ b/configs/model/network/backbone/openclip_B_32.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: laion/CLIP-ViT-B-32-laion2B-s34B-b79K + +output_dim: 768 diff --git a/configs/model/network/backbone/openclip_H_14.yaml b/configs/model/network/backbone/openclip_H_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69d510cc19b72921701414f4e076257831898f84 --- /dev/null +++ b/configs/model/network/backbone/openclip_H_14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: laion/CLIP-ViT-H-14-laion2B-s32B-b79K + +output_dim: 1280 diff --git a/configs/model/network/backbone/openclip_L_14.yaml b/configs/model/network/backbone/openclip_L_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf01cdab4ffdc3ca9241eecd4e727262b5001af9 --- /dev/null +++ b/configs/model/network/backbone/openclip_L_14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: laion/CLIP-ViT-L-14-laion2B-s32B-b82K + +output_dim: 1024 diff --git a/configs/model/network/backbone/openclip_bigG_14.yaml b/configs/model/network/backbone/openclip_bigG_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fe60509d7da3834811b7be6be04eef3e2225ff3 --- /dev/null +++ b/configs/model/network/backbone/openclip_bigG_14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k + +output_dim: 1664 diff --git a/configs/model/network/backbone/openclip_g_14.yaml b/configs/model/network/backbone/openclip_g_14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d96c4f9a8a4d3b141f3dcdf5b65946a6c333f0f0 --- /dev/null +++ b/configs/model/network/backbone/openclip_g_14.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: 
laion/CLIP-ViT-g-14-laion2B-s12B-b42K + +output_dim: 1408 diff --git a/configs/model/network/backbone/scratch_B_32.yaml b/configs/model/network/backbone/scratch_B_32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e79f78928ca01421afcd4f7abadcbc9e42b444fb --- /dev/null +++ b/configs/model/network/backbone/scratch_B_32.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.CLIP + path: '' + +output_dim: 768 diff --git a/configs/model/network/backbone/streetclip.yaml b/configs/model/network/backbone/streetclip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ed65d14a3ed446323ddf53d4f0e36b1e1be726e --- /dev/null +++ b/configs/model/network/backbone/streetclip.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.StreetCLIP + path: geolocal/StreetCLIP + +output_dim: 768 diff --git a/configs/model/network/baselines.yaml b/configs/model/network/baselines.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe376147cad601300e67106022c8e8df7e79d4e4 --- /dev/null +++ b/configs/model/network/baselines.yaml @@ -0,0 +1,8 @@ +defaults: + - head: id_to_gps + +instance: + _target_: models.networks.network.NoFeatureBackbone + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/best_backbone.yaml b/configs/model/network/best_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b40524577606842d785f0a5713420faccadf662 --- /dev/null +++ b/configs/model/network/best_backbone.yaml @@ -0,0 +1,14 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.ContrastiveHybridUnFrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + mode: ${mode} + +class_name: ${class_name} +root_dir: ${root_dir} diff --git a/configs/model/network/contrastive_frozen_backbone.yaml b/configs/model/network/contrastive_frozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2307bbc0b2d0acad41a70a4961eeaceddb08c305 --- /dev/null +++ b/configs/model/network/contrastive_frozen_backbone.yaml @@ -0,0 +1,13 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.ContrastiveFrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + mode: ${mode} + +class_name: ${class_name} diff --git a/configs/model/network/contrastive_unfrozen_backbone.yaml b/configs/model/network/contrastive_unfrozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11698012a5b4a8bf263c152401ac56b26897a0db --- /dev/null +++ b/configs/model/network/contrastive_unfrozen_backbone.yaml @@ -0,0 +1,13 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.ContrastiveUnFrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + mode: ${mode} + +class_name: ${class_name} diff --git a/configs/model/network/frozen_backbone.yaml b/configs/model/network/frozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b76f49694a4452f3f27c2f5832df8952cbd54ab --- /dev/null +++ b/configs/model/network/frozen_backbone.yaml @@ -0,0 +1,12 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: 
models.networks.network.FrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/head/classification.yaml b/configs/model/network/head/classification.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ab1ba7e92b0e2dc93a631f12ae7f692d98fdfa4 --- /dev/null +++ b/configs/model/network/head/classification.yaml @@ -0,0 +1,7 @@ +target_key: label +final_dim: ${num_classes} +instance: + _target_: models.networks.heads.classification.ClassificationHead + id_to_gps: + _target_: models.networks.heads.id_to_gps.IdToGPS + id_to_gps: ${data_dir}/index_to_gps_unique_${class_name}.pt diff --git a/configs/model/network/head/hybrid.yaml b/configs/model/network/head/hybrid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aba5cedcf2811f56da364d77facce878189c18a3 --- /dev/null +++ b/configs/model/network/head/hybrid.yaml @@ -0,0 +1,8 @@ +target_key: label +final_dim: ${eval:'${num_classes}*3'} +instance: + _target_: models.networks.heads.hybrid.HybridHeadCentroid + final_dim: ${num_classes} + use_tanh: true + scale_tanh: 1.2 + quadtree_path: ${data_dir}/${class_name}.csv diff --git a/configs/model/network/head/hybrid_sharedreg.yaml b/configs/model/network/head/hybrid_sharedreg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abd9a10de73c6cd5f96c6854d15bab1cadc20b57 --- /dev/null +++ b/configs/model/network/head/hybrid_sharedreg.yaml @@ -0,0 +1,7 @@ +final_dim: ${eval:'${num_classes}+2'} + +instance: + _target_: models.networks.heads.hybrid.SharedHybridHead + +defaults: + - hybrid diff --git a/configs/model/network/head/id_to_gps.yaml b/configs/model/network/head/id_to_gps.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e0855ed242021b42f23bf8d5c64449c01da3ae5 --- /dev/null +++ b/configs/model/network/head/id_to_gps.yaml @@ -0,0 +1,5 @@ +target_key: gps +final_dim: ${num_classes} +instance: + _target_: models.networks.heads.id_to_gps.IdToGPS + id_to_gps: ${data_dir}/index_to_gps_${class_name}.pt diff --git a/configs/model/network/head/multi_task.yaml b/configs/model/network/head/multi_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72b006d74e58be16fb5c2b18b415029e5bd32477 --- /dev/null +++ b/configs/model/network/head/multi_task.yaml @@ -0,0 +1,6 @@ +target_key: gps +final_dim: 2 +instance: + _target_: models.networks.heads.auxilliary.AuxHead + aux_data: ${aux_data} + use_tanh: true diff --git a/configs/model/network/head/random.yaml b/configs/model/network/head/random.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e09984cae75539d697d8f714e2960b6facbcef4 --- /dev/null +++ b/configs/model/network/head/random.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.heads.random.Random + num_output: 2 + +target_key: gps diff --git a/configs/model/network/head/random_class.yaml b/configs/model/network/head/random_class.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d40579b0070c3c598137e4917a15df127eaeba41 --- /dev/null +++ b/configs/model/network/head/random_class.yaml @@ -0,0 +1,3 @@ +instance: + _target_: models.models.networks.random.Random + num_output: ${num_classes} diff --git a/configs/model/network/head/random_coords.yaml b/configs/model/network/head/random_coords.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a6912ed80ae5524b49375c25a2daf476ed08c1a --- /dev/null +++ 
b/configs/model/network/head/random_coords.yaml @@ -0,0 +1,3 @@ +instance: + _target_: models.networks.heads.random.RandomCoords + coords_path: ${dataset.train_dataset.path}/train/train.csv diff --git a/configs/model/network/head/regression.yaml b/configs/model/network/head/regression.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbc1bfc389e799c57d2ce090e2e3d6aeab776192 --- /dev/null +++ b/configs/model/network/head/regression.yaml @@ -0,0 +1,5 @@ +target_key: gps +final_dim: 2 +instance: + _target_: models.networks.heads.regression.RegressionHead + use_tanh: true diff --git a/configs/model/network/head/regression_angle.yaml b/configs/model/network/head/regression_angle.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1edd6c5794619a35403d772d3f53882c9a3ff178 --- /dev/null +++ b/configs/model/network/head/regression_angle.yaml @@ -0,0 +1,4 @@ +target_key: gps +final_dim: 4 +instance: + _target_: models.networks.heads.regression.RegressionHeadAngle diff --git a/configs/model/network/hybrid_frozen_backbone.yaml b/configs/model/network/hybrid_frozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9adbe44c892691383a53ff131c4eb9f6db2fe0c --- /dev/null +++ b/configs/model/network/hybrid_frozen_backbone.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.network.HybridFrozenBackbone + +defaults: + - frozen_backbone diff --git a/configs/model/network/hybrid_unfrozen_backbone.yaml b/configs/model/network/hybrid_unfrozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa65dd02caafa3c6bffa5c59de728336518eaa74 --- /dev/null +++ b/configs/model/network/hybrid_unfrozen_backbone.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.network.HybridUnfrozenBackbone + +defaults: + - unfrozen_backbone diff --git a/configs/model/network/last_block_backbone.yaml b/configs/model/network/last_block_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1b7ac56a0adbee20fc0632e62319f49b8b3a7d1 --- /dev/null +++ b/configs/model/network/last_block_backbone.yaml @@ -0,0 +1,12 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.UnfrozenPartBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/lora_backbone.yaml b/configs/model/network/lora_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf87dcc4d6d16d4d3d6d666ee92f505b5a8224a6 --- /dev/null +++ b/configs/model/network/lora_backbone.yaml @@ -0,0 +1,16 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.LoraBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + r: ${lora_r} + alpha: ${lora_alpha} + dropout: ${lora_dropout} + bias: ${lora_bias} + +class_name: ${class_name} diff --git a/configs/model/network/mid/activation/gelu.yaml b/configs/model/network/mid/activation/gelu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38f605b3505ed6bc61101a38585cc79b9a915f4a --- /dev/null +++ b/configs/model/network/mid/activation/gelu.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.GELU +_partial_: true diff --git a/configs/model/network/mid/activation/relu.yaml b/configs/model/network/mid/activation/relu.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f2018f12ef077af7ca12794130b47e003e100011 --- /dev/null +++ b/configs/model/network/mid/activation/relu.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.ReLU +_partial_: true diff --git a/configs/model/network/mid/identity.yaml b/configs/model/network/mid/identity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5994f7f8f2cc87d962fe2d8ef348b5c823478dd --- /dev/null +++ b/configs/model/network/mid/identity.yaml @@ -0,0 +1,2 @@ +instance: + _target_: models.networks.mlp.Identity \ No newline at end of file diff --git a/configs/model/network/mid/mlp.yaml b/configs/model/network/mid/mlp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc1526e6fd55413aee194d3dff758b062637cd4b --- /dev/null +++ b/configs/model/network/mid/mlp.yaml @@ -0,0 +1,13 @@ +defaults: + - activation: gelu + - norm: groupnorm #instance_1d + +instance: + _target_: models.networks.mlp.MLP + initial_dim: ${model.network.backbone.output_dim} + hidden_dim: + - ${model.network.backbone.output_dim} + - 64 + final_dim: ${model.network.head.final_dim} + norm: ${model.network.mid.norm} + activation: ${model.network.mid.activation} diff --git a/configs/model/network/mid/mlp_classif.yaml b/configs/model/network/mid/mlp_classif.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af2a68746947e5c3bfcc789986738d31ce373fe9 --- /dev/null +++ b/configs/model/network/mid/mlp_classif.yaml @@ -0,0 +1,13 @@ +defaults: + - activation: gelu + - norm: groupnorm #instance_1d + +instance: + _target_: models.networks.mlp.MLP + initial_dim: ${model.network.backbone.output_dim} + hidden_dim: + - ${model.network.backbone.output_dim} + - 512 + final_dim: ${model.network.head.final_dim} + norm: ${model.network.mid.norm} + activation: ${model.network.mid.activation} diff --git a/configs/model/network/mid/mlp_hybrid.yaml b/configs/model/network/mid/mlp_hybrid.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3b2a0e5b8d9d1ee3aa756055ce6cb1c7cb9d78f --- /dev/null +++ b/configs/model/network/mid/mlp_hybrid.yaml @@ -0,0 +1,13 @@ +defaults: + - activation: gelu + - norm: groupnorm #instance_1d + +instance: + _target_: models.networks.mlp.MLPCentroid + initial_dim: ${model.network.backbone.output_dim} + hidden_dim: + - ${model.network.backbone.output_dim} + - 512 + final_dim: ${model.network.head.final_dim} + norm: ${model.network.mid.norm} + activation: ${model.network.mid.activation} \ No newline at end of file diff --git a/configs/model/network/mid/mlp_multi.yaml b/configs/model/network/mid/mlp_multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c95d3d0ce657fc9582a7d1e1709654cde65e7298 --- /dev/null +++ b/configs/model/network/mid/mlp_multi.yaml @@ -0,0 +1,14 @@ +defaults: + - activation: gelu + - norm: identity + +instance: + _target_: models.networks.mlp.MLP + initial_dim: ${model.network.backbone.output_dim} + hidden_dim: + - ${model.network.backbone.output_dim} + - 64 + final_dim: ${model.network.head.final_dim} + norm: ${model.network.mid.norm} + activation: ${model.network.mid.activation} + aux_data: ${aux_data} diff --git a/configs/model/network/mid/mlp_resnet.yaml b/configs/model/network/mid/mlp_resnet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f5470e0172b165d18415e04e2e5af6c0342bc57 --- /dev/null +++ b/configs/model/network/mid/mlp_resnet.yaml @@ -0,0 +1,13 @@ +defaults: + - activation: gelu + - norm: groupnorm #instance_1d + +instance: + _target_: 
models.networks.mlp.MLPResNet + initial_dim: ${model.network.backbone.output_dim} + hidden_dim: + - ${model.network.backbone.output_dim} + - 64 + final_dim: ${model.network.head.final_dim} + norm: ${model.network.mid.norm} + activation: ${model.network.mid.activation} diff --git a/configs/model/network/mid/norm/batchnorm.yaml b/configs/model/network/mid/norm/batchnorm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6981245907923e7eb25a69396ea0ac1ac33303f1 --- /dev/null +++ b/configs/model/network/mid/norm/batchnorm.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.BatchNorm1d +_partial_: true diff --git a/configs/model/network/mid/norm/groupnorm.yaml b/configs/model/network/mid/norm/groupnorm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cda2cd2679a59c0b622f1a23a0a604e2c48d5b8 --- /dev/null +++ b/configs/model/network/mid/norm/groupnorm.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.GroupNorm +_partial_: true diff --git a/configs/model/network/mid/norm/identity.yaml b/configs/model/network/mid/norm/identity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..085370029c933d49a07de9bc621ab39e00b0d569 --- /dev/null +++ b/configs/model/network/mid/norm/identity.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.Identity +_partial_: true diff --git a/configs/model/network/mid/norm/instance_1d.yaml b/configs/model/network/mid/norm/instance_1d.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb9a092eff21a89a5b2bb5dc4b10f683b23c2cf6 --- /dev/null +++ b/configs/model/network/mid/norm/instance_1d.yaml @@ -0,0 +1,2 @@ +_target_: torch.nn.InstanceNorm1d +_partial_: true diff --git a/configs/model/network/multi_task.yaml b/configs/model/network/multi_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5d36204f53fcd45cd02be3d7c2112cc46f16617 --- /dev/null +++ b/configs/model/network/multi_task.yaml @@ -0,0 +1,12 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp_multi + - head: multi_task + +instance: + _target_: models.networks.network.UnfrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/random.yaml b/configs/model/network/random.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71cae4f78356df38be5601c92ee6f4944ac4d39f --- /dev/null +++ b/configs/model/network/random.yaml @@ -0,0 +1,8 @@ +defaults: + - head: random + +instance: + _target_: models.networks.network.NoFeatureBackbone + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/text_contrastive.yaml b/configs/model/network/text_contrastive.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2f965fd7c9b5c3a943676e4e6ef8da3c0c29ded --- /dev/null +++ b/configs/model/network/text_contrastive.yaml @@ -0,0 +1,12 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - head: regression + +instance: + _target_: models.networks.network.TextContrastiveUnFrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/network/unfrozen_backbone.yaml b/configs/model/network/unfrozen_backbone.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2fe0e56cf7499a859e6363036f5b4fb76b8e7b8 --- /dev/null +++ b/configs/model/network/unfrozen_backbone.yaml @@ -0,0 +1,12 @@ +defaults: + - backbone: openclip_B_32 + - mid: mlp + - 
head: regression + +instance: + _target_: models.networks.network.UnfrozenBackbone + backbone : ${model.network.backbone} + mid: ${model.network.mid} + head: ${model.network.head} + +class_name: ${class_name} diff --git a/configs/model/optimizer/adam.yaml b/configs/model/optimizer/adam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1626e276138a0b4a4226017f63e98cde141347a7 --- /dev/null +++ b/configs/model/optimizer/adam.yaml @@ -0,0 +1,12 @@ +optim: + _target_: torch.optim.Adam + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 0.01 + +exclude_ln_and_biases_from_weight_decay: False +lora_lr: 1e-4 +backbone_lr: 5e-6 +last_block_lr: 5e-5 +unfreeze_lr: False +diff_backbone_last: False diff --git a/configs/model/optimizer/adamw.yaml b/configs/model/optimizer/adamw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c50743c7edb140cf001b63d114e64c73b1168e1 --- /dev/null +++ b/configs/model/optimizer/adamw.yaml @@ -0,0 +1,10 @@ +optim: + _target_: torch.optim.AdamW + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 0.01 + +exclude_ln_and_biases_from_weight_decay: False +lora_lr: 1e-4 +backbone_lr: 2e-5 +unfreeze_lr: False diff --git a/configs/model/optimizer/lamb.yaml b/configs/model/optimizer/lamb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..112a1a78d889236af324edaa354ddd201764fc29 --- /dev/null +++ b/configs/model/optimizer/lamb.yaml @@ -0,0 +1,10 @@ +optim: + _target_: utils.optimizers.Lamb + lr: 1e-3 + betas: [0.9, 0.999] + weight_decay: 0.01 + +exclude_ln_and_biases_from_weight_decay: False +lora_lr: 1e-4 +backbone_lr: 2e-5 +unfreeze_lr: False diff --git a/configs/model/optimizer/none.yaml b/configs/model/optimizer/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8fcbba7bdb72a56b34dbaf07b6d8cfb0fe2c62a --- /dev/null +++ b/configs/model/optimizer/none.yaml @@ -0,0 +1,7 @@ +optim: + _target_: models.misc.DoNothingOptimizer + +exclude_ln_and_biases_from_weight_decay: false +lora_lr: 1e-4 +backbone_lr: 2e-5 +unfreeze_lr: False diff --git a/configs/model/random.yaml b/configs/model/random.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcb1a5b0f7e384f49429607ecfc60c5c69ff7e39 --- /dev/null +++ b/configs/model/random.yaml @@ -0,0 +1,10 @@ +defaults: + - optimizer: none + - lr_scheduler: none + - network: random + - loss: mix + - _self_ + +name: Random +aux_data: ${aux_data} +text_tuning: ${text_tuning} diff --git a/configs/model/regression.yaml b/configs/model/regression.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b829190c551727910db2c913ff96e9fd1745337 --- /dev/null +++ b/configs/model/regression.yaml @@ -0,0 +1,10 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: frozen_backbone + - loss: reg + - _self_ + +name: Regression +aux_data: ${aux_data} +text_tuning: ${text_tuning} diff --git a/configs/model/text_network/clip.yaml b/configs/model/text_network/clip.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f02c2e858c69ba47a81f0db4a0ce917809c463f9 --- /dev/null +++ b/configs/model/text_network/clip.yaml @@ -0,0 +1,5 @@ +instance: + _target_: models.networks.backbones.TextEncoder + path: ${model.network.backbone.instance.path} + +class_name: ${class_name} diff --git a/configs/model/text_tuning.yaml b/configs/model/text_tuning.yaml new file mode 100644 index 0000000000000000000000000000000000000000..333606496b928dade37fa518e848d3a6fce3f14d --- /dev/null +++ b/configs/model/text_tuning.yaml @@ 
-0,0 +1,11 @@ +defaults: + - optimizer: adam + - lr_scheduler: none + - network: text_contrastive + - text_network: clip + - loss: text_tuning + - _self_ + +name: Regression +aux_data: ${aux_data} +text_tuning: ${text_tuning} diff --git a/configs/scripts/enrich-metadata-quadtree.yaml b/configs/scripts/enrich-metadata-quadtree.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acf366673d0976b7d15a5a1bc0bde34093680532 --- /dev/null +++ b/configs/scripts/enrich-metadata-quadtree.yaml @@ -0,0 +1,4 @@ +data_dir: ??? +depth: 10 +do_split: 1000 +overwrite_csv: False diff --git a/configs/scripts/preprocess.yaml b/configs/scripts/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed72893008db792810a56a0587cdeb322746fb0e --- /dev/null +++ b/configs/scripts/preprocess.yaml @@ -0,0 +1,4 @@ +data_dir: ??? +depth: 10 +do_split: 1000 +overwrite_csv: True diff --git a/data/__init__.py b/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/data/augmentation.py b/data/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..b1802a335705184f6709ee4a2ec1425b6175758a --- /dev/null +++ b/data/augmentation.py @@ -0,0 +1,223 @@ +""" +Adapted from https://github.com/nv-nguyen/template-pose/blob/main/src/utils/augmentation.py +""" + +from torchvision import transforms +from PIL import ImageEnhance, ImageFilter, Image +import numpy as np +import random +import logging +from torchvision.transforms import RandomResizedCrop, ToTensor + + +class PillowRGBAugmentation: + def __init__(self, pillow_fn, p, factor_interval): + self._pillow_fn = pillow_fn + self.p = p + self.factor_interval = factor_interval + + def __call__(self, PIL_image): + if random.random() <= self.p: + factor = random.uniform(*self.factor_interval) + if PIL_image.mode != "RGB": + logging.warning( + f"Error when applying data aug, image mode: {PIL_image.mode}" + ) + PIL_image = PIL_image.convert("RGB") + logging.warning(f"Converted image to {PIL_image.mode}") + PIL_image = (self._pillow_fn(PIL_image).enhance(factor=factor)).convert( + "RGB" + ) + return PIL_image + + +class PillowSharpness(PillowRGBAugmentation): + def __init__( + self, + p=0.3, + factor_interval=(0, 40.0), + ): + super().__init__( + pillow_fn=ImageEnhance.Sharpness, + p=p, + factor_interval=factor_interval, + ) + + +class PillowContrast(PillowRGBAugmentation): + def __init__( + self, + p=0.3, + factor_interval=(0.5, 1.6), + ): + super().__init__( + pillow_fn=ImageEnhance.Contrast, + p=p, + factor_interval=factor_interval, + ) + + +class PillowBrightness(PillowRGBAugmentation): + def __init__( + self, + p=0.5, + factor_interval=(0.5, 2.0), + ): + super().__init__( + pillow_fn=ImageEnhance.Brightness, + p=p, + factor_interval=factor_interval, + ) + + +class PillowColor(PillowRGBAugmentation): + def __init__( + self, + p=1, + factor_interval=(0.0, 20.0), + ): + super().__init__( + pillow_fn=ImageEnhance.Color, + p=p, + factor_interval=factor_interval, + ) + + +class PillowBlur: + def __init__(self, p=0.4, factor_interval=(1, 3)): + self.p = p + self.k = random.randint(*factor_interval) + + def __call__(self, PIL_image): + if random.random() <= self.p: + PIL_image = PIL_image.filter(ImageFilter.GaussianBlur(self.k)) + return PIL_image + + +class NumpyGaussianNoise: + def __init__(self, p, factor_interval=(0.01, 0.3)): + self.noise_ratio = random.uniform(*factor_interval) + self.p = p + + def __call__(self, img): + if
random.random() <= self.p: + img = np.copy(img) + noisesigma = random.uniform(0, self.noise_ratio) + gauss = np.random.normal(0, noisesigma, img.shape) * 255 + img = img + gauss + + img[img > 255] = 255 + img[img < 0] = 0 + return Image.fromarray(np.uint8(img)) + + +class StandardAugmentation: + def __init__( + self, names, brightness, contrast, sharpness, color, blur, gaussian_noise + ): + self.brightness = brightness + self.contrast = contrast + self.sharpness = sharpness + self.color = color + self.blur = blur + self.gaussian_noise = gaussian_noise + + # define a dictionary of augmentation functions to be applied + self.names = names.split(",") + self.augmentations = { + "brightness": self.brightness, + "contrast": self.contrast, + "sharpness": self.sharpness, + "color": self.color, + "blur": self.blur, + "gaussian_noise": self.gaussian_noise, + } + + def __call__(self, img): + for name in self.names: + img = self.augmentations[name](img) + return img + + +class GeometricAugmentation: + def __init__( + self, + names, + random_resized_crop, + random_horizontal_flip, + random_vertical_flip, + random_rotation, + ): + self.random_resized_crop = random_resized_crop + self.random_horizontal_flip = random_horizontal_flip + self.random_vertical_flip = random_vertical_flip + self.random_rotation = random_rotation + self.names = names.split(",") + + self.augmentations = { + "random_resized_crop": self.random_resized_crop, + "random_horizontal_flip": self.random_horizontal_flip, + "random_vertical_flip": self.random_vertical_flip, + "random_rotation": self.random_rotation, + } + + def __call__(self, img): + for name in self.names: + img = self.augmentations[name](img) + return img + + +class ImageAugmentation: + def __init__( + self, names, clip_transform, standard_augmentation, geometric_augmentation + ): + self.clip_transform = clip_transform + self.standard_augmentation = standard_augmentation + self.geometric_augmentation = geometric_augmentation + self.names = names.split(",") + self.transforms = { + "clip_transform": self.clip_transform, + "standard_augmentation": self.standard_augmentation, + "geometric_augmentation": self.geometric_augmentation, + } + print(f"Image augmentation: {self.names}") + + def __call__(self, img): + for name in self.names: + img = self.transforms[name](img) + return img + + +if __name__ == "__main__": + # sanity check + import glob + import torchvision.transforms as transforms + from torchvision.utils import save_image + from omegaconf import DictConfig, OmegaConf + from hydra.utils import instantiate + import torch + from PIL import Image + + augmentation_config = OmegaConf.load( + "./configs/dataset/train_transform/augmentation.yaml" + ) + augmentation_config.names = "standard_augmentation,geometric_augmentation" + augmentation_transform = instantiate(augmentation_config) + img_paths = glob.glob("./datasets/osv5m/test/images/*.jpg") + + num_try = 20 + num_try_per_image = 8 + num_imgs = 8 + + for idx in range(num_try): + imgs = [] + for idx_img in range(num_imgs): + img = Image.open(img_paths[idx_img]) + for idx_try in range(num_try_per_image): + if idx_try == 0: + imgs.append(ToTensor()(img.resize((224, 224)))) + img_aug = augmentation_transform(img.copy()) + img_aug = ToTensor()(img_aug) + imgs.append(img_aug) + imgs = torch.stack(imgs) + save_image(imgs, f"augmentation_{idx:03d}.png", nrow=9) diff --git a/data/data.py b/data/data.py new file mode 100644 index 0000000000000000000000000000000000000000..f0cb0316cd067ba2e034557ee755e797681042e2 --- /dev/null +++ 
b/data/data.py @@ -0,0 +1,711 @@ +import numpy as np +import pandas as pd +import torch +import random + +from os.path import join +from os.path import isfile +from PIL import Image +from sklearn.model_selection import train_test_split +from torch.utils.data import Dataset +from torchvision.transforms import ( + Compose, + RandomCrop, + CenterCrop, + RandomHorizontalFlip, + ToTensor, +) +import time +from torchvision.transforms import GaussianBlur +from torchvision import transforms + +def normalize(lat, lon): + """Used to put all lat lon inside ±90 and ±180.""" + lat = (lat + 90) % 360 - 90 + if lat > 90: + lat = 180 - lat + lon += 180 + lon = (lon + 180) % 360 - 180 + return lat, lon + + +def collate_fn(batch): + """Collate function for the dataloader. + Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + keys = list(batch[0].keys()) + if "weight" in batch[0].keys(): + keys.remove("weight") + output = {} + for key in [ + "idx", + "unique_country", + "unique_region", + "unique_sub-region", + "unique_city", + "img_idx", + "text", + ]: + if key in keys: + idx = [x[key] for x in batch] + output[key] = idx + keys.remove(key) + for key in keys: + if not ("text" in key): + output[key] = torch.stack([x[key] for x in batch]) + return output + + +def collate_fn_streetclip(batch): + """Collate function for the dataloader. + Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + keys = list(batch[0].keys()) + if "weight" in batch[0].keys(): + keys.remove("weight") + output = {} + for key in [ + "idx", + "unique_country", + "unique_region", + "unique_sub-region", + "unique_city", + "img_idx", + "img", + "text", + ]: + if key in keys: + idx = [x[key] for x in batch] + output[key] = idx + keys.remove(key) + for key in keys: + if not ("text" in key): + output[key] = torch.stack([x[key] for x in batch]) + return output + + +def collate_fn_denstity(batch): + """Collate function for the dataloader. + Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + keys = list(batch[0].keys()) + if "weight" in batch[0].keys(): + keys.remove("weight") + # Sample indices based on the weights + weights = np.array([x["weight"] for x in batch]) + normalized_weights = weights / np.sum(weights) + sampled_indices = np.random.choice( + len(batch), size=len(batch), p=normalized_weights, replace=True + ) + output = {} + for key in [ + "idx", + "unique_country", + "unique_region", + "unique_sub-region", + "unique_city", + "img_idx", + "text", + ]: + if key in keys: + idx = [batch[i][key] for i in sampled_indices] + output[key] = idx + keys.remove(key) + for key in keys: + if not ("text" in key): + output[key] = torch.stack([batch[i][key] for i in sampled_indices]) + return output + + +def collate_fn_streetclip_denstity(batch): + """Collate function for the dataloader. 
+ Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + keys = list(batch[0].keys()) + if "weight" in batch[0].keys(): + keys.remove("weight") + # Sample indices based on the weights + weights = np.array([x["weight"] for x in batch]) + normalized_weights = weights / np.sum(weights) + sampled_indices = np.random.choice( + len(batch), size=len(batch), p=normalized_weights, replace=True + ) + output = {} + for key in [ + "idx", + "unique_country", + "unique_region", + "unique_sub-region", + "unique_city", + "img_idx", + "img", + "text", + ]: + if key in keys: + idx = [batch[i][key] for i in sampled_indices] + output[key] = idx + keys.remove(key) + for key in keys: + if not ("text" in key): + output[key] = torch.stack([batch[i][key] for i in sampled_indices]) + return output + + +def collate_fn_contrastive(batch): + """Collate function for the dataloader. + Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + output = collate_fn(batch) + pos_img = torch.stack([x["pos_img"] for x in batch]) + output["pos_img"] = pos_img + return output + + +def collate_fn_contrastive_density(batch): + """Collate function for the dataloader. + Args: + batch (list): list of dictionaries with keys "img", "gps", "idx" and optionally "label" + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + keys = list(batch[0].keys()) + if "weight" in batch[0].keys(): + keys.remove("weight") + # Sample indices based on the weights + weights = np.array([x["weight"] for x in batch]) + normalized_weights = weights / np.sum(weights) + sampled_indices = np.random.choice( + len(batch), size=len(batch), p=normalized_weights, replace=True + ) + output = {} + for key in [ + "idx", + "unique_country", + "unique_region", + "unique_sub-region", + "unique_city", + "img_idx", + ]: + if key in keys: + idx = [batch[i][key] for i in sampled_indices] + output[key] = idx + keys.remove(key) + for key in keys: + if not ("text" in key): + output[key] = torch.stack([batch[i][key] for i in sampled_indices]) + return output + + +class osv5m(Dataset): + csv_dtype = {"category": str, "country": str, "city": str} # Don't remove. + + def __init__( + self, + path, + transforms, + split="train", + class_name=None, + aux_data=[], + is_baseline=False, + areas=["country", "region", "sub-region", "city"], + streetclip=False, + suff="", + blur=False + ): + """Initializes the dataset. + Args: + path (str): path to the dataset + transforms (torchvision.transforms): transforms to apply to the images + split (str): split to use (train, val, test) + class_name (str): category to use (e.g. 
"city") + aux_data (list of str): auxilliary datas to use + areas (list of str): regions to perform accuracy + streetclip (bool): if the model is streetclip, do not use transform + suff (str): suffix of test csv + blur (bool): blur bottom of images or not + """ + self.suff = suff + self.path = path + self.aux = len(aux_data) > 0 + self.aux_list = aux_data + self.split = split + if split == "select": + self.df = self.load_split(split) + split = "test" + else: + self.df = self.load_split(split) + self.split = split + self.image_folder = join( + path, + 'images', + ("train" if split == "val" else split), + ) + + self.dict_names = {} + for root, _, files in os.walk(self.image_folder): + for file in files: + self.dict_names[file] = os.path.join(root, file) + + self.is_baseline = is_baseline + if self.aux: + self.aux_data = {} + for col in self.aux_list: + if col in ["land_cover", "climate", "soil"]: + self.aux_data[col] = pd.get_dummies(self.df[col], dtype=float) + if col == "climate": + for i in range(31): + if not (i in list(self.aux_data[col].columns)): + self.aux_data[col][i] = 0 + desired_order = [i for i in range(31)] + desired_order.remove(20) + self.aux_data[col] = self.aux_data[col][desired_order] + else: + self.aux_data[col] = self.df[col].apply(lambda x: [x]) + + self.areas = ["_".join(["unique", area]) for area in areas] + if class_name is None: + self.class_name = class_name + elif "quadtree" in class_name: + self.class_name = class_name + else: + self.class_name = "_".join(["unique", class_name]) + ex = self.extract_classes(self.class_name) + self.df = self.df[ + ["id", "latitude", "longitude", "weight"] + self.areas + ex + ].fillna("NaN") + if self.class_name in self.areas: + self.df.columns = list(self.df.columns)[:-1] + [self.class_name + "_2"] + self.transforms = transforms + self.collate_fn = collate_fn + self.collate_fn_density = collate_fn_denstity + self.blur = blur + self.streetclip = streetclip + if self.streetclip: + self.collate_fn = collate_fn_streetclip + self.collate_fn_density = collate_fn_streetclip_denstity + + def load_split(self, split): + """Returns a new dataset with the given split.""" + start_time = time.time() + if split == "test": + df = pd.read_csv(join(self.path, "test.csv"), dtype=self.csv_dtype) + # extract coord + longitude = df["longitude"].values + latitude = df["latitude"].values + # Create bins + num_bins = 100 + lon_bins = np.linspace(longitude.min(), longitude.max(), num_bins) + lat_bins = np.linspace(latitude.min(), latitude.max(), num_bins) + # compute density and weights + hist, _, _ = np.histogram2d(longitude, latitude, bins=[lon_bins, lat_bins]) + weights = 1.0 / np.power(hist[df["lon_bin"], df["lat_bin"]], 0.75) + normalized_weights = weights / np.sum(weights) + df["weight"] = normalized_weights + return df + elif split == "select": + df = pd.read_csv( + join(self.path, "select.csv"), dtype=self.csv_dtype + ) + # extract coord + longitude = df["longitude"].values + latitude = df["latitude"].values + # Create bins + num_bins = 100 + lon_bins = np.linspace(longitude.min(), longitude.max(), num_bins) + lat_bins = np.linspace(latitude.min(), latitude.max(), num_bins) + # compute density and weights + hist, _, _ = np.histogram2d(longitude, latitude, bins=[lon_bins, lat_bins]) + weights = 1.0 / np.power(hist[df["lon_bin"], df["lat_bin"]], 0.75) + normalized_weights = weights / np.sum(weights) + df["weight"] = normalized_weights + return df + else: + if len(self.suff) == 0: + df = pd.read_csv( + join(self.path, "train.csv"), dtype=self.csv_dtype 
+ ) + else: + df = pd.read_csv( + join(self.path, "train" + "_" + self.suff + ".csv"), + dtype=self.csv_dtype, + ) + + # extract coord + longitude = df["longitude"].values + latitude = df["latitude"].values + # Create bins + num_bins = 100 + lon_bins = np.linspace(longitude.min(), longitude.max(), num_bins) + lat_bins = np.linspace(latitude.min(), latitude.max(), num_bins) + # compute density and weights + hist, _, _ = np.histogram2d(longitude, latitude, bins=[lon_bins, lat_bins]) + weights = 1.0 / np.power(hist[df["lon_bin"], df["lat_bin"]], 0.75) + normalized_weights = weights / np.sum(weights) + df["weight"] = normalized_weights + + test_df = df.sample( + n=int(0.1 * len(df)), + weights=normalized_weights, + replace=False, + random_state=42, + ) + + end_time = time.time() + print(f"Loading {split} dataset took {(end_time - start_time):.2f} seconds") + + if split == "val": + return test_df + else: + return df.drop(test_df.index) + + def extract_classes(self, tag=None): + """Extracts the categories from the dataset.""" + if tag is None: + self.has_labels = False + return [] + splits = ["train", "test"] if self.is_baseline else ["train"] + # splits = ["train", "test"] + print(f"Loading categories from {splits}") + + # concatenate all categories from relevant splits to find the unique ones. + self.categories = sorted( + pd.concat( + [ + pd.read_csv(join(self.path, f"{split}.csv"))[tag] + for split in splits + ] + ) + .fillna("NaN") + .unique() + .tolist() + ) + + if "NaN" in self.categories: + self.categories.remove("NaN") + if self.split != "test": + self.df = self.df.dropna(subset=[tag]) + # compute the total number of categories - this name is fixed and will be used as a lookup during init + self.num_classes = len(self.categories) + + # create a mapping from category to index + self.category_to_index = { + category: i for i, category in enumerate(self.categories) + } + self.has_labels = True + return [tag] + + def __getitem__(self, i): + """Returns an item from the dataset. 
+ Args: + i (int): index of the item + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + x = list(self.df.iloc[i]) # id, latitude, longitude, {category} + if self.streetclip: + img = Image.open(self.dict_names[f"{int(x[0])}.jpg"]) + elif self.blur: + img = transforms.ToTensor()(Image.open(self.dict_names[f"{int(x[0])}.jpg"])) + u = GaussianBlur(kernel_size = 13, sigma=2.0) + bottom_part = img[:, -14:, :].unsqueeze(0) + blurred_bottom = u(bottom_part) + img[:, -14:, :] = blurred_bottom.squeeze() + img = self.transforms(transforms.ToPILImage()(img)) + else: + img = self.transforms( + Image.open(self.dict_names[f"{int(x[0])}.jpg"]) + ) + + lat, lon = normalize(x[1], x[2]) + gps = torch.FloatTensor([np.radians(lat), np.radians(lon)]).squeeze(0) + + output = { + "img": img, + "gps": gps, + "idx": i, + "img_idx": int(x[0]), + "weight": x[3], + } + + for count, area in enumerate(self.areas): + output[area] = x[ + count + 4 + ] #'country': x[3], 'region': x[4], 'sub-region': x[5], 'city': x[6]} + + if self.has_labels: + if x[-1] in self.categories: + output["label"] = torch.LongTensor( + [self.category_to_index[x[-1]]] + ).squeeze(-1) + else: + output["label"] = torch.LongTensor([-1]).squeeze(-1) + if self.aux: + for col in self.aux_list: + output[col] = torch.FloatTensor(self.aux_data[col].iloc[i]) + return output + + def __len__(self): + return len(self.df) + + +class Contrastiveosv5m(osv5m): + def __init__( + self, + path, + transforms, + split="train", + class_name=None, + aux_data=[], + class_name2=None, + blur=False, + ): + """ + class_name2 (str): if not None, contrastive learning is done on this class rather than the one used for classification + """ + super().__init__( + path, + transforms, + split=split, + class_name=class_name, + aux_data=aux_data, + blur=blur, + ) + self.add_label = False + if not(class_name2 is None) and split != 'test' and split != 'select': + self.add_label = True + self.class_name = class_name2 + self.extract_classes_contrastive(tag=class_name2) + self.df = self.df.reset_index(drop=True) + self.dict_classes = { + value: indices.tolist() + for value, indices in self.df.groupby(self.class_name).groups.items() + } + self.collate_fn = collate_fn_contrastive + self.random_crop = RandomCrop(224) # use when no positive image is available + + def sample_positive(self, i): + """ + sample a positive image from the same class (city, country) if one is available, + otherwise apply a different crop to the same image + """ + x = self.df.iloc[i] # id, latitude, longitude, {category} + class_name = x[self.class_name] + idxs = list(self.dict_classes[class_name]) # copy so the cached index list is not mutated + idxs.remove(i) + + if len(idxs) > 0: + idx = random.choice(idxs) + x = self.df.iloc[idx] + pos_img = self.transforms( + Image.open(self.dict_names[f"{int(x['id'])}.jpg"]) + ) + else: + pos_img = self.random_crop( + self.transforms( + Image.open(self.dict_names[f"{int(x['id'])}.jpg"]) + ) + ) + return pos_img + + def extract_classes_contrastive(self, tag=None): + """Extracts the categories from the dataset.""" + if tag is None: + self.has_labels = False + return [] + splits = ["train", "test"] if self.is_baseline else ["train"] + # splits = ["train", "test"] + print(f"Loading categories from {splits}")
 + + # concatenate all categories from relevant splits to find the unique ones.
+ categories = sorted( + pd.concat( + [ + pd.read_csv(join(self.path, f"{split}.csv"))[tag] + for split in splits + ] + ) + .fillna("NaN") + .unique() + .tolist() + ) + # create a mapping from category to index + self.contrastive_category_to_index = { + category: i for i, category in enumerate(categories) + } + + + def __getitem__(self, i): + output = super().__getitem__(i) + pos_img = self.sample_positive(i) + output["pos_img"] = pos_img + if self.add_label: + output["label_contrastive"] = torch.LongTensor( + [self.contrastive_category_to_index[self.df[self.class_name].iloc[i]]] + ).squeeze(-1) + return output + + +class TextContrastiveosv5m(osv5m): + def __init__( + self, + path, + transforms, + split="train", + class_name=None, + aux_data=[], + blur=False, + ): + super().__init__( + path, + transforms, + split=split, + class_name=class_name, + aux_data=aux_data, + blur=blur, + ) + self.df = self.df.reset_index(drop=True) + + def get_text(self, i): + """ + sample positive image from the same city, country if it is available + otherwise, apply different crop to the image + """ + x = self.df.iloc[i] # id, latitude, longitude, {category} + l = [ + name.split("_")[-1] + for name in [ + x["unique_city"], + x["unique_sub-region"], + x["unique_region"], + x["unique_country"], + ] + ] + + pre = False + sentence = "An image of " + if l[0] != "NaN": + sentence += "the city of " + sentence += l[0] + pre = True + + if l[1] != "NaN": + if pre: + sentence += ", in " + sentence += "the area of " + sentence += l[1] + pre = True + + if l[2] != "NaN": + if pre: + sentence += ", in " + sentence += "the region of " + sentence += l[2] + pre = True + + if l[3] != "NaN": + if pre: + sentence += ", in " + sentence += l[3] + + return sentence + + def __getitem__(self, i): + output = super().__getitem__(i) + output["text"] = self.get_text(i) + return output + + +import os +import json + + +class Baseline(Dataset): + def __init__( + self, + path, + which, + transforms, + ): + """Initializes the dataset. + Args: + path (str): path to the dataset + which (str): which baseline to use (im2gps, im2gps3k) + transforms (torchvision.transforms): transforms to apply to the images + """ + baselines = { + "im2gps": self.load_im2gps, + "im2gps3k": self.load_im2gps, + "yfcc4k": self.load_yfcc4k, + } + self.path = path + self.samples = baselines[which]() + self.transforms = transforms + self.collate_fn = collate_fn + self.class_name = which + + def load_im2gps( + self, + ): + json_path = join(self.path, "info.json") + with open(json_path) as f: + data = json.load(f) + + samples = [] + for f in os.listdir(join(self.path, "images")): + if len(data[f]): + lat = float(data[f][-4].replace("latitude: ", "")) + lon = float(data[f][-3].replace("longitude: ", "")) + samples.append((f, lat, lon)) + + return samples + + def load_yfcc4k( + self, + ): + samples = [] + with open(join(self.path, "info.txt")) as f: + lines = f.readlines() + for line in lines: + x = line.split("\t") + f, lon, lat = x[1], x[12], x[13] + samples.append((f + ".jpg", float(lat), float(lon))) + + return samples + + def __getitem__(self, i): + """Returns an item from the dataset. 
+ Args: + i (int): index of the item + Returns: + dict: dictionary with keys "img", "gps", "idx" and optionally "label" + """ + img_path, lat, lon = self.samples[i] + img = self.transforms( + Image.open(join(self.path, "images", img_path)).convert("RGB") + ) + lat, lon = normalize(lat, lon) + gps = torch.FloatTensor([np.radians(lat), np.radians(lon)]).squeeze(0) + + return { + "img": img, + "gps": gps, + "idx": i, + } + + def __len__(self): + return len(self.samples) \ No newline at end of file diff --git a/data/datamodule.py b/data/datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..7e5734cff7948e53c6ad64588debe0ecad51f46d --- /dev/null +++ b/data/datamodule.py @@ -0,0 +1,85 @@ +import pytorch_lightning as L +from torch.utils.data import DataLoader, random_split +import torch +import time + + +class ImageDataModule(L.LightningDataModule): + def __init__( + self, + train_dataset, + val_dataset, + test_dataset, + global_batch_size, + num_workers, + num_nodes=1, + num_devices=1, + val_proportion=0.1, + ): + super().__init__() + self._builders = { + "train": train_dataset, + "val": val_dataset, + "test": test_dataset, + } + self.num_workers = num_workers + self.batch_size = global_batch_size // (num_nodes * num_devices) + print(f"Each GPU will receive {self.batch_size} images") + self.val_proportion = val_proportion + + @property + def num_classes(self): + if hasattr(self, "train_dataset"): + return self.train_dataset.num_classes + else: + return self._builders["train"]().num_classes + + def setup(self, stage=None): + """Setup the datamodule. + Args: + stage (str): stage of the datamodule + Is be one of "fit" or "test" or None + """ + print("Stage", stage) + start_time = time.time() + if stage == "fit" or stage is None: + self.train_dataset = self._builders["train"]() + self.val_dataset = self._builders["val"]() + print(f"Train dataset size: {len(self.train_dataset)}") + print(f"Val dataset size: {len(self.val_dataset)}") + else: + self.test_dataset = self._builders["test"]() + print(f"Test dataset size: {len(self.test_dataset)}") + end_time = time.time() + print(f"Setup took {(end_time - start_time):.2f} seconds") + + def train_dataloader(self): + return DataLoader( + self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + pin_memory=False, + drop_last=True, + num_workers=self.num_workers, + collate_fn=self.train_dataset.collate_fn_density, + ) + + def val_dataloader(self): + return DataLoader( + self.val_dataset, + batch_size=self.batch_size, + shuffle=False, + pin_memory=False, + num_workers=self.num_workers, + collate_fn=self.val_dataset.collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + pin_memory=False, + num_workers=self.num_workers, + collate_fn=self.test_dataset.collate_fn, + ) diff --git a/data/transforms.py b/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..60aa1e04cdc80775f386742b038a4167c652e9e1 --- /dev/null +++ b/data/transforms.py @@ -0,0 +1,44 @@ +from transformers import CLIPProcessor + + +class ClipTransform(object): + def __init__(self, split): + self.transform = CLIPProcessor.from_pretrained("geolocal/StreetCLIP") + + def __call__(self, x): + # return self.transform(images=x, return_tensors="pt")["pixel_values"].squeeze(0) + return self.transform(images=[x], return_tensors="pt") + + +if __name__ == "__main__": + # sanity check + import glob + import torchvision.transforms as transforms + from 
torchvision.utils import save_image + from omegaconf import DictConfig, OmegaConf + from hydra.utils import instantiate + import torch + from PIL import Image + + fast_clip_config = OmegaConf.load( + "./configs/dataset/train_transform/fast_clip.yaml" + ) + fast_clip_transform = instantiate(fast_clip_config) + clip_transform = ClipTransform(None) + + img_paths = glob.glob("./datasets/osv5m/test/images/*.jpg") + original_imgs, re_implemted_imgs, diff = [], [], [] + + for i in range(16): + img = Image.open(img_paths[i]) + clip_img = clip_transform(img) + fast_clip_img = fast_clip_transform(img) + original_imgs.append(clip_img) + re_implemted_imgs.append(fast_clip_img) + max_diff = (clip_img - fast_clip_img).abs() + diff.append(max_diff) + if max_diff.max() > 1e-5: + print(max_diff.max()) + original_imgs = torch.stack(original_imgs) + re_implemted_imgs = torch.stack(re_implemted_imgs) + diff = torch.stack(diff)
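The coordinate targets used throughout data/data.py are first passed through normalize() and then converted to radians: latitudes are folded back into ±90 (crossing a pole flips the longitude by 180 degrees) and longitudes are wrapped into ±180. A minimal self-contained sketch of that convention, using the same formula as the code above; the example values are illustrative only.

import numpy as np
import torch

def normalize(lat, lon):
    # same wrap-around rule as data/data.py
    lat = (lat + 90) % 360 - 90
    if lat > 90:
        lat = 180 - lat
        lon += 180
    lon = (lon + 180) % 360 - 180
    return lat, lon

lat, lon = normalize(91.0, 10.0)   # a point just past the north pole
assert (lat, lon) == (89.0, -170.0)
gps = torch.FloatTensor([np.radians(lat), np.radians(lon)])   # the "gps" tensor returned by __getitem__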
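load_split() attaches a weight to every row by binning coordinates into a 100x100 longitude/latitude histogram and taking the inverse bin count to the power 0.75; collate_fn_denstity then resamples each batch with those probabilities. The following is a hedged sketch of the same weighting on synthetic coordinates: here the lon_bin/lat_bin assignments are recomputed with np.digitize, whereas the loader indexes precomputed lon_bin/lat_bin columns from the CSV.

import numpy as np

rng = np.random.default_rng(0)
lon = rng.uniform(-180, 180, size=1000)
lat = rng.uniform(-90, 90, size=1000)

num_bins = 100
lon_bins = np.linspace(lon.min(), lon.max(), num_bins)
lat_bins = np.linspace(lat.min(), lat.max(), num_bins)
hist, _, _ = np.histogram2d(lon, lat, bins=[lon_bins, lat_bins])

# bin index of each sample (assumed to mirror the lon_bin/lat_bin columns the loader reads)
lon_bin = np.clip(np.digitize(lon, lon_bins) - 1, 0, num_bins - 2)
lat_bin = np.clip(np.digitize(lat, lat_bins) - 1, 0, num_bins - 2)

# inverse-density weights, softened by the 0.75 exponent, then normalised to probabilities
weights = 1.0 / np.power(hist[lon_bin, lat_bin], 0.75)
weights = weights / weights.sum()

# collate_fn_denstity resamples the batch (with replacement) using these probabilities
batch_indices = np.random.choice(len(lon), size=32, p=weights, replace=True)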
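For reference, TextContrastiveosv5m.get_text composes the administrative hierarchy into a single caption, skipping any level equal to "NaN". With every level present the output looks like (place names illustrative only) "An image of the city of Toulouse, in the area of Haute-Garonne, in the region of Occitanie, in France", while a row that only has a country reduces to "An image of France".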