katielink commited on Aug 16, 2023

Commit

cd6dcce

1 Parent(s): a264b9c

complete the model package

Browse files

Files changed (27) hide show

README.md +90 -0
configs/inference.json +138 -0
configs/logging.conf +21 -0
configs/metadata.json +82 -0
docs/README.md +83 -0
docs/demos.png +0 -0
docs/license.txt +4 -0
docs/renal.png +0 -0
docs/unest.png +0 -0
docs/val_dice.png +0 -0
models/model.pt +3 -0
scripts/__init__.py +10 -0
scripts/__pycache__/__init__.cpython-38.pyc +0 -0
scripts/networks/__init__.py +10 -0
scripts/networks/__pycache__/__init__.cpython-38.pyc +0 -0
scripts/networks/__pycache__/nest_transformer_3D.cpython-38.pyc +0 -0
scripts/networks/__pycache__/patchEmbed3D.cpython-38.pyc +0 -0
scripts/networks/__pycache__/unest.cpython-38.pyc +0 -0
scripts/networks/__pycache__/unest_block.cpython-38.pyc +0 -0
scripts/networks/nest/__init__.py +16 -0
scripts/networks/nest/__pycache__/__init__.cpython-38.pyc +0 -0
scripts/networks/nest/__pycache__/utils.cpython-38.pyc +0 -0
scripts/networks/nest/utils.py +485 -0
scripts/networks/nest_transformer_3D.py +489 -0
scripts/networks/patchEmbed3D.py +190 -0
scripts/networks/unest.py +274 -0
scripts/networks/unest_block.py +245 -0

README.md ADDED Viewed

	@@ -0,0 +1,90 @@

+---
+tags:
+- monai
+- medical
+library_name: monai
+license: unknown
+---
+# Description
+A pre-trained model for inferencing volumetric (3D) kidney substructures segmentation from contrast-enhanced CT images (Arterial/Portal Venous Phase).
+A tutorial and release of model for kidney cortex, medulla and collecting system segmentation.
+Authors: Yinchi Zhou (yinchi.zhou@vanderbilt.edu) | Xin Yu (xin.yu@vanderbilt.edu) | Yucheng Tang (yuchengt@nvidia.com) |
+# Model Overview
+A pre-trained UNEST base model [1] for volumetric (3D) renal structures segmentation using dynamic contrast enhanced arterial or venous phase CT images.
+## Data
+The training data is from the [ImageVU RenalSeg dataset] from Vanderbilt University and Vanderbilt University Medical Center.
+(The training data is not publicly available yet.)
+- Target: Renal Cortex | Medulla | Pelvis Collecting System
+- Task: Segmentation
+- Modality: CT (Arterial | Venous phase)
+- Size: 96 3D volumes
+The data and segmentation demonstration are as follows:
+![](./renal.png) <br>
+## Method and Network
+The UNEST model is a 3D hierarchical transformer-based segmentation network.
+Details of the architecture:
+![](./unest.png) <br>
+## Training configuration
+The training was performed with at least one 16GB-memory GPU.
+Actual Model Input: 96 x 96 x 96
+## Input and output formats
+Input: 1 channel CT image
+Output: 4: 0:Background, 1:Renal Cortex, 2:Medulla, 3:Pelvicalyceal System
+## Performance
+A graph showing the validation mean Dice for 5000 epochs.
+![](./val_dice.png) <br>
+This model achieves the following Dice score on the validation data (our own split from the training dataset):
+Mean Validation Dice = 0.8523
+Note that mean dice is computed in the original spacing of the input data.
+## commands example
+Download the trained checkpoint model to ./models/model.pt:
+Add scripts component:  To run the workflow with customized components, PYTHONPATH should be revised to include the path to the customized component:
+```
+export PYTHONPATH=$PYTHONPATH:"'<path to the bundle root dir>/scripts'"
+```
+Execute inference:
+```
+python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file configs/inference.json --logging_file configs/logging.conf
+```
+## More examples output
+![](./demos.png) <br>
+# Disclaimer
+This is an example, not to be used for diagnostic purposes.
+# References
+[1] Yu, Xin, Yinchi Zhou, Yucheng Tang et al. "Characterizing Renal Structures with 3D Block Aggregate Transformers." arXiv preprint arXiv:2203.02430 (2022). https://arxiv.org/pdf/2203.02430.pdf
+[2] Zizhao Zhang et al. "Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and Interpretable Visual Understanding." AAAI Conference on Artificial Intelligence (AAAI) 2022

configs/inference.json ADDED Viewed

	@@ -0,0 +1,138 @@

+{
+    "imports": [
+        "$import glob",
+        "$import os"
+    ],
+    "bundle_root": "/models/renalStructures_UNEST_segmentation",
+    "output_dir": "$@bundle_root + '/eval'",
+    "dataset_dir": "$@bundle_root + '/dataset/spleen'",
+    "datalist": "$list(sorted(glob.glob(@dataset_dir + '/*.nii.gz')))",
+    "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
+    "network_def": {
+        "_target_": "scripts.networks.unest.UNesT",
+        "in_channels": 1,
+        "out_channels": 4
+    },
+    "network": "$@network_def.to(@device)",
+    "preprocessing": {
+        "_target_": "Compose",
+        "transforms": [
+            {
+                "_target_": "LoadImaged",
+                "keys": "image"
+            },
+            {
+                "_target_": "AddChanneld",
+                "keys": "image"
+            },
+            {
+                "_target_": "Orientationd",
+                "keys": "image",
+                "axcodes": "RAS"
+            },
+            {
+                "_target_": "Spacingd",
+                "keys": "image",
+                "pixdim": [
+                    1.0,
+                    1.0,
+                    1.0
+                ],
+                "mode": "bilinear"
+            },
+            {
+                "_target_": "ScaleIntensityRanged",
+                "keys": "image",
+                "a_min": -175,
+                "a_max": 250,
+                "b_min": 0.0,
+                "b_max": 1.0,
+                "clip": true
+            },
+            {
+                "_target_": "EnsureTyped",
+                "keys": "image"
+            }
+        ]
+    },
+    "dataset": {
+        "_target_": "Dataset",
+        "data": "$[{'image': i} for i in @datalist]",
+        "transform": "@preprocessing"
+    },
+    "dataloader": {
+        "_target_": "DataLoader",
+        "dataset": "@dataset",
+        "batch_size": 1,
+        "shuffle": false,
+        "num_workers": 4
+    },
+    "inferer": {
+        "_target_": "SlidingWindowInferer",
+        "roi_size": [
+            96,
+            96,
+            96
+        ],
+        "sw_batch_size": 4,
+        "overlap": 0.5
+    },
+    "postprocessing": {
+        "_target_": "Compose",
+        "transforms": [
+            {
+                "_target_": "Activationsd",
+                "keys": "pred",
+                "softmax": true
+            },
+            {
+                "_target_": "Invertd",
+                "keys": "pred",
+                "transform": "@preprocessing",
+                "orig_keys": "image",
+                "meta_key_postfix": "meta_dict",
+                "nearest_interp": false,
+                "to_tensor": true
+            },
+            {
+                "_target_": "AsDiscreted",
+                "keys": "pred",
+                "argmax": true
+            },
+            {
+                "_target_": "SaveImaged",
+                "keys": "pred",
+                "meta_keys": "pred_meta_dict",
+                "output_dir": "@output_dir"
+            }
+        ]
+    },
+    "handlers": [
+        {
+            "_target_": "CheckpointLoader",
+            "load_path": "$@bundle_root + '/models/model.pt'",
+            "load_dict": {
+                "state_dict": "@network"
+            },
+            "strict": "True"
+        },
+        {
+            "_target_": "StatsHandler",
+            "iteration_log": false
+        }
+    ],
+    "evaluator": {
+        "_target_": "SupervisedEvaluator",
+        "device": "@device",
+        "val_data_loader": "@dataloader",
+        "network": "@network",
+        "inferer": "@inferer",
+        "postprocessing": "@postprocessing",
+        "val_handlers": "@handlers",
+        "amp": false
+    },
+    "evaluating": [
+        "$setattr(torch.backends.cudnn, 'benchmark', True)",
+        "$@evaluator.run()"
+    ]
+}

configs/logging.conf ADDED Viewed

	@@ -0,0 +1,21 @@

+[loggers]
+keys=root
+[handlers]
+keys=consoleHandler
+[formatters]
+keys=fullFormatter
+[logger_root]
+level=INFO
+handlers=consoleHandler
+[handler_consoleHandler]
+class=StreamHandler
+level=INFO
+formatter=fullFormatter
+args=(sys.stdout,)
+[formatter_fullFormatter]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s

configs/metadata.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+    "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
+    "version": "0.1.0",
+    "changelog": {
+        "0.1.0": "complete the model package",
+        "0.0.1": "initialize the model package structure"
+    },
+    "monai_version": "0.9.0",
+    "pytorch_version": "1.10.0",
+    "numpy_version": "1.21.2",
+    "optional_packages_version": {
+        "nibabel": "3.2.1",
+        "pytorch-ignite": "0.4.8",
+        "einops": "0.4.1",
+        "fire": "0.4.0",
+        "timm": "0.6.7"
+    },
+    "task": "Renal segmentation",
+    "description": "A transformer-based model for renal segmentation from CT image",
+    "authors": "Vanderbilt University + MONAI team",
+    "copyright": "Copyright (c) MONAI Consortium",
+    "data_source": "RawData.zip",
+    "data_type": "nibabel",
+    "image_classes": "single channel data, intensity scaled to [0, 1]",
+    "label_classes": "1: Kidney Cortex, 2:Medulla, 3:Pelvicalyceal system",
+    "pred_classes": "1: Kidney Cortex, 2:Medulla, 3:Pelvicalyceal system",
+    "eval_metrics": {
+        "mean_dice": 0.85
+    },
+    "intended_use": "This is an example, not to be used for diagnostic purposes",
+    "references": [
+        "Tang, Yucheng, et al. 'Self-supervised pre-training of swin transformers for 3d medical image analysis.' arXiv preprint arXiv:2111.14791 (2021). https://arxiv.org/abs/2111.14791."
+    ],
+    "network_data_format": {
+        "inputs": {
+            "image": {
+                "type": "image",
+                "format": "hounsfield",
+                "modality": "CT",
+                "num_channels": 1,
+                "spatial_shape": [
+                    96,
+                    96,
+                    96
+                ],
+                "dtype": "float32",
+                "value_range": [
+                    0,
+                    1
+                ],
+                "is_patch_data": true,
+                "channel_def": {
+                    "0": "image"
+                }
+            }
+        },
+        "outputs": {
+            "pred": {
+                "type": "image",
+                "format": "segmentation",
+                "num_channels": 4,
+                "spatial_shape": [
+                    96,
+                    96,
+                    96
+                ],
+                "dtype": "float32",
+                "value_range": [
+                    0,
+                    1
+                ],
+                "is_patch_data": true,
+                "channel_def": {
+                    "0": "background",
+                    "1": "kidney cortex",
+                    "2": "medulla",
+                    "3": "pelvicalyceal system"
+                }
+            }
+        }
+    }
+}

docs/README.md ADDED Viewed

	@@ -0,0 +1,83 @@

+# Description
+A pre-trained model for inferencing volumetric (3D) kidney substructures segmentation from contrast-enhanced CT images (Arterial/Portal Venous Phase).
+A tutorial and release of model for kidney cortex, medulla and collecting system segmentation.
+Authors: Yinchi Zhou (yinchi.zhou@vanderbilt.edu) | Xin Yu (xin.yu@vanderbilt.edu) | Yucheng Tang (yuchengt@nvidia.com) |
+# Model Overview
+A pre-trained UNEST base model [1] for volumetric (3D) renal structures segmentation using dynamic contrast enhanced arterial or venous phase CT images.
+## Data
+The training data is from the [ImageVU RenalSeg dataset] from Vanderbilt University and Vanderbilt University Medical Center.
+(The training data is not publicly available yet.)
+- Target: Renal Cortex | Medulla | Pelvis Collecting System
+- Task: Segmentation
+- Modality: CT (Arterial | Venous phase)
+- Size: 96 3D volumes
+The data and segmentation demonstration are as follows:
+![](./renal.png) <br>
+## Method and Network
+The UNEST model is a 3D hierarchical transformer-based segmentation network.
+Details of the architecture:
+![](./unest.png) <br>
+## Training configuration
+The training was performed with at least one 16GB-memory GPU.
+Actual Model Input: 96 x 96 x 96
+## Input and output formats
+Input: 1 channel CT image
+Output: 4: 0:Background, 1:Renal Cortex, 2:Medulla, 3:Pelvicalyceal System
+## Performance
+A graph showing the validation mean Dice for 5000 epochs.
+![](./val_dice.png) <br>
+This model achieves the following Dice score on the validation data (our own split from the training dataset):
+Mean Validation Dice = 0.8523
+Note that mean dice is computed in the original spacing of the input data.
+## commands example
+Download the trained checkpoint model to ./models/model.pt:
+Add scripts component:  To run the workflow with customized components, PYTHONPATH should be revised to include the path to the customized component:
+```
+export PYTHONPATH=$PYTHONPATH:"'<path to the bundle root dir>/scripts'"
+```
+Execute inference:
+```
+python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file configs/inference.json --logging_file configs/logging.conf
+```
+## More examples output
+![](./demos.png) <br>
+# Disclaimer
+This is an example, not to be used for diagnostic purposes.
+# References
+[1] Yu, Xin, Yinchi Zhou, Yucheng Tang et al. "Characterizing Renal Structures with 3D Block Aggregate Transformers." arXiv preprint arXiv:2203.02430 (2022). https://arxiv.org/pdf/2203.02430.pdf
+[2] Zizhao Zhang et al. "Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and Interpretable Visual Understanding." AAAI Conference on Artificial Intelligence (AAAI) 2022

docs/demos.png ADDED Viewed

docs/license.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Third Party Licenses
+-----------------------------------------------------------------------
+/*********************************************************************/

docs/renal.png ADDED Viewed

docs/unest.png ADDED Viewed

docs/val_dice.png ADDED Viewed

models/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8928e88771d31945c51d1b302a8448825e6f9861a543a6e1023acb9576840962
+size 348887167

scripts/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

scripts/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (192 Bytes). View file

scripts/networks/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

scripts/networks/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (201 Bytes). View file

scripts/networks/__pycache__/nest_transformer_3D.cpython-38.pyc ADDED Viewed

Binary file (15.5 kB). View file

scripts/networks/__pycache__/patchEmbed3D.cpython-38.pyc ADDED Viewed

Binary file (5.8 kB). View file

scripts/networks/__pycache__/unest.cpython-38.pyc ADDED Viewed

Binary file (5.79 kB). View file

scripts/networks/__pycache__/unest_block.cpython-38.pyc ADDED Viewed

Binary file (5.45 kB). View file

scripts/networks/nest/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/env python3
+from .utils import (
+    Conv3dSame,
+    DropPath,
+    Linear,
+    Mlp,
+    _assert,
+    conv3d_same,
+    create_conv3d,
+    create_pool3d,
+    get_padding,
+    get_same_padding,
+    pad_same,
+    to_ntuple,
+    trunc_normal_,
+)

scripts/networks/nest/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (496 Bytes). View file

scripts/networks/nest/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (15.2 kB). View file

scripts/networks/nest/utils.py ADDED Viewed

	@@ -0,0 +1,485 @@

+#!/usr/bin/env python3
+import collections.abc
+import math
+import warnings
+from itertools import repeat
+from typing import List, Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Use torch._assert when the installed torch exports it; otherwise define a
+# fallback with the same (condition, message) call signature built on a plain assert.
+try:
+    from torch import _assert
+except ImportError:
+    def _assert(condition: bool, message: str):
+        assert condition, message
+def drop_block_2d(
+    x,
+    drop_prob: float = 0.1,
+    block_size: int = 7,
+    gamma_scale: float = 1.0,
+    with_noise: bool = False,
+    inplace: bool = False,
+    batchwise: bool = False,
+):
+    """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
+    DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
+    runs with success, but needs further validation and possibly optimization for lower runtime impact.
+    Args:
+        x: 4-D tensor; its shape is unpacked as (b, c, h, w).
+        drop_prob: scales the per-position seed rate ``gamma``.
+        block_size: side of the dropped square; clipped to ``min(w, h)``.
+        gamma_scale: extra multiplier applied to ``gamma``.
+        with_noise: if True, dropped positions are replaced by standard normal noise
+            instead of being zeroed and rescaled.
+        inplace: if True, ``x`` is modified with in-place ops and returned.
+        batchwise: if True, a single (1, c, h, w) random mask is drawn for the whole batch.
+    Returns:
+        The masked tensor (``x`` itself when ``inplace`` is True).
+    """
+    b, c, h, w = x.shape
+    total_size = w * h
+    clipped_block_size = min(block_size, min(w, h))
+    # seed_drop_rate, the gamma parameter
+    # NOTE(review): the denominator uses the unclipped block_size, so it is non-positive
+    # when block_size exceeds w + 1 or h + 1 - confirm whether clipped_block_size was intended.
+    gamma = (
+        gamma_scale * drop_prob * total_size / clipped_block_size**2 / ((w - block_size + 1) * (h - block_size + 1))
+    )
+    # Forces the block to be inside the feature map.
+    # NOTE(review): the grids are built from (arange(w), arange(h)) and then reshaped to
+    # (1, 1, h, w); for h != w verify that this reshape gives the intended layout.
+    w_i, h_i = torch.meshgrid(torch.arange(w).to(x.device), torch.arange(h).to(x.device))
+    valid_block = ((w_i >= clipped_block_size // 2) & (w_i < w - (clipped_block_size - 1) // 2)) & (
+        (h_i >= clipped_block_size // 2) & (h_i < h - (clipped_block_size - 1) // 2)
+    )
+    valid_block = torch.reshape(valid_block, (1, 1, h, w)).to(dtype=x.dtype)
+    if batchwise:
+        # one mask for whole batch, quite a bit faster
+        uniform_noise = torch.rand((1, c, h, w), dtype=x.dtype, device=x.device)
+    else:
+        uniform_noise = torch.rand_like(x)
+    # block_mask is 1 where the position is kept and 0 at a drop seed
+    # (a seed requires valid_block == 1 and uniform_noise < gamma).
+    block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype)
+    # Negated max-pool acts as a min-pool: every seed zeroes its surrounding window.
+    block_mask = -F.max_pool2d(
+        -block_mask, kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2  # block_size,
+    )
+    if with_noise:
+        normal_noise = torch.randn((1, c, h, w), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x)
+        if inplace:
+            x.mul_(block_mask).add_(normal_noise * (1 - block_mask))
+        else:
+            x = x * block_mask + normal_noise * (1 - block_mask)
+    else:
+        # Rescale by (number of elements / number kept); the epsilon avoids division by zero.
+        normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype)
+        if inplace:
+            x.mul_(block_mask * normalize_scale)
+        else:
+            x = x * block_mask * normalize_scale
+    return x
+def drop_block_fast_2d(
+    x: torch.Tensor,
+    drop_prob: float = 0.1,
+    block_size: int = 7,
+    gamma_scale: float = 1.0,
+    with_noise: bool = False,
+    inplace: bool = False,
+):
+    """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
+    DropBlock with an experimental gaussian noise option. Simplified from above without concern for valid
+    block mask at edges.
+    Args:
+        x: 4-D tensor; its shape is unpacked as (b, c, h, w).
+        drop_prob: scales the Bernoulli seed probability ``gamma``.
+        block_size: side of the dropped square; clipped to ``min(w, h)``.
+        gamma_scale: extra multiplier applied to ``gamma``.
+        with_noise: if True, dropped positions are replaced by standard normal noise
+            instead of being zeroed and rescaled.
+        inplace: if True, ``x`` is modified with in-place ops and returned.
+    Returns:
+        The masked tensor (``x`` itself when ``inplace`` is True).
+    """
+    b, c, h, w = x.shape
+    total_size = w * h
+    clipped_block_size = min(block_size, min(w, h))
+    # NOTE(review): as in drop_block_2d, the denominator uses the unclipped block_size.
+    gamma = (
+        gamma_scale * drop_prob * total_size / clipped_block_size**2 / ((w - block_size + 1) * (h - block_size + 1))
+    )
+    # Here block_mask is 1 at dropped positions: Bernoulli seeds expanded by a max-pool.
+    block_mask = torch.empty_like(x).bernoulli_(gamma)
+    block_mask = F.max_pool2d(
+        block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2
+    )
+    if with_noise:
+        normal_noise = torch.empty_like(x).normal_()
+        if inplace:
+            x.mul_(1.0 - block_mask).add_(normal_noise * block_mask)
+        else:
+            x = x * (1.0 - block_mask) + normal_noise * block_mask
+    else:
+        # Flip to a keep-mask, then rescale by (number of elements / number kept).
+        block_mask = 1 - block_mask
+        normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6)).to(dtype=x.dtype)
+        if inplace:
+            x.mul_(block_mask * normalize_scale)
+        else:
+            x = x * block_mask * normalize_scale
+    return x
+class DropBlock2d(nn.Module):
+    """Module form of DropBlock (https://arxiv.org/pdf/1810.12890.pdf).
+    Stores the DropBlock settings and, in training mode with a non-zero ``drop_prob``,
+    applies ``drop_block_fast_2d`` (``fast=True``) or ``drop_block_2d`` to the input.
+    Otherwise the input is returned untouched.
+    """
+    def __init__(
+        self, drop_prob=0.1, block_size=7, gamma_scale=1.0, with_noise=False, inplace=False, batchwise=False, fast=True
+    ):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.gamma_scale = gamma_scale
+        self.block_size = block_size
+        self.with_noise = with_noise
+        self.inplace = inplace
+        self.batchwise = batchwise
+        self.fast = fast  # FIXME finish comparisons of fast vs not
+    def forward(self, x):
+        # Identity outside training or when dropping is disabled.
+        if not (self.training and self.drop_prob):
+            return x
+        common_args = (x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace)
+        if self.fast:
+            return drop_block_fast_2d(*common_args)
+        # Only the non-fast variant takes the batchwise flag.
+        return drop_block_2d(*common_args, self.batchwise)
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
+    """Stochastic depth: randomly zero whole samples of ``x``.
+    Outside training, or when ``drop_prob`` is 0, ``x`` is returned unchanged. Otherwise one
+    Bernoulli(1 - drop_prob) value is drawn per entry of the first dimension and broadcast
+    over all remaining dimensions. With ``scale_by_keep`` the surviving samples are divided
+    by the keep probability (skipped when that probability is 0).
+    """
+    if not training or drop_prob == 0.0:
+        return x
+    survival = 1 - drop_prob
+    # Shape (N, 1, ..., 1) so the mask works for tensors of any rank.
+    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    mask = x.new_empty(mask_shape).bernoulli_(survival)
+    if scale_by_keep and survival > 0.0:
+        mask.div_(survival)
+    return x * mask
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    Args:
+        drop_prob: probability of dropping a sample; ``None`` (the default) disables dropping.
+        scale_by_keep: forwarded to ``drop_path``; rescales kept samples by the keep probability.
+    """
+    def __init__(self, drop_prob=None, scale_by_keep=True):
+        super().__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def forward(self, x):
+        # The default drop_prob=None would reach ``1 - None`` inside drop_path in
+        # training mode; treat it as 0.0 so the module is an identity instead.
+        drop_prob = 0.0 if self.drop_prob is None else self.drop_prob
+        return drop_path(x, drop_prob, self.training, self.scale_by_keep)
+def create_conv3d(in_channels, out_channels, kernel_size, **kwargs):
+    """Select a 3d convolution implementation based on arguments
+    Delegates to ``create_conv3d_pad``, which returns ``torch.nn.Conv3d`` or ``Conv3dSame``.
+    Args:
+        in_channels: number of input channels.
+        out_channels: number of output channels.
+        kernel_size: convolution kernel size.
+        **kwargs: forwarded to ``create_conv3d_pad``. ``depthwise=True`` forces
+            ``groups=in_channels``; otherwise ``groups`` defaults to 1.
+    """
+    depthwise = kwargs.pop("depthwise", False)
+    # Always take ``groups`` out of kwargs: leaving it in when depthwise=True would pass
+    # the keyword twice to create_conv3d_pad and raise a TypeError.
+    groups = kwargs.pop("groups", 1)
+    if depthwise:
+        # for DW out_channels must be multiple of in_channels as must have out_channels % groups == 0
+        groups = in_channels
+    return create_conv3d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
+def conv3d_same(
+    x,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    stride: Tuple[int, int, int] = (1, 1, 1),
+    padding: Tuple[int, int, int] = (0, 0, 0),
+    dilation: Tuple[int, int, int] = (1, 1, 1),
+    groups: int = 1,
+):
+    """3D convolution with TensorFlow-like 'SAME' padding.
+    The input is first padded by ``pad_same`` using the last three dimensions of ``weight``
+    as the kernel size, then ``F.conv3d`` is applied with zero explicit padding.
+    ``padding`` is accepted for signature compatibility but is not used.
+    """
+    x = pad_same(x, weight.shape[-3:], stride, dilation)
+    return F.conv3d(x, weight, bias, stride, (0, 0, 0), dilation, groups)
+class Conv3dSame(nn.Conv3d):
+    """Tensorflow like 'SAME' convolution wrapper for 3D convolutions
+    Subclasses ``nn.Conv3d`` so that the weight is 5-D and stride/dilation are 3-tuples,
+    as required by the ``F.conv3d`` call made in ``conv3d_same``. The ``padding``
+    argument is accepted but ignored: the layer is built with padding 0 and the input
+    is padded dynamically in ``forward``.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
+        super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+    def forward(self, x):
+        return conv3d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+def create_conv3d_pad(in_chs, out_chs, kernel_size, **kwargs):
+    """Build a 3D convolution, resolving the ``padding`` keyword first.
+    ``padding`` (default ``""``) is resolved by ``get_padding_value``; ``bias`` defaults to
+    False. A dynamic 'SAME' result yields ``Conv3dSame``, anything else a plain ``nn.Conv3d``
+    with the resolved static padding.
+    """
+    requested_padding = kwargs.pop("padding", "")
+    kwargs.setdefault("bias", False)
+    static_padding, needs_dynamic = get_padding_value(requested_padding, kernel_size, **kwargs)
+    if not needs_dynamic:
+        return nn.Conv3d(in_chs, out_chs, kernel_size, padding=static_padding, **kwargs)
+    return Conv3dSame(in_chs, out_chs, kernel_size, **kwargs)
+# Symmetric (same on both sides) padding for a convolution
+def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int:
+    """Return ``((stride - 1) + dilation * (kernel_size - 1)) // 2``; extra keywords are ignored."""
+    return (dilation * (kernel_size - 1) + (stride - 1)) // 2
+# Total TensorFlow-like 'SAME' padding needed along one dimension
+def get_same_padding(x: int, k: int, s: int, d: int):
+    """Return the total padding for input size ``x``, kernel ``k``, stride ``s``, dilation ``d`` (never negative)."""
+    out_size = math.ceil(x / s)
+    covered_extent = (out_size - 1) * s + (k - 1) * d + 1
+    return max(covered_extent - x, 0)
+# Whether 'SAME' padding for these settings can be a fixed, symmetric amount
+def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_):
+    """True only for stride 1 with an even ``dilation * (kernel_size - 1)``; extra keywords are ignored."""
+    if stride != 1:
+        return False
+    return (dilation * (kernel_size - 1)) % 2 == 0
+# Dynamically pad input x with 'SAME' padding for conv with specified args
+def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1, 1), value: float = 0):
+    """Pad the last three dimensions of ``x`` with TensorFlow-like 'SAME' padding.
+    Args:
+        x: tensor whose last three dimensions are read as (depth, height, width).
+        k: kernel size per dimension, indexed ``k[0..2]`` in (depth, height, width) order.
+        s: stride per dimension, same order.
+        d: dilation per dimension, same order.
+        value: fill value for the padded region.
+    Returns:
+        The padded tensor, or ``x`` unchanged when no padding is needed.
+    """
+    in_d, in_h, in_w = x.size()[-3:]
+    pad_d = get_same_padding(in_d, k[0], s[0], d[0])
+    pad_h = get_same_padding(in_h, k[1], s[1], d[1])
+    pad_w = get_same_padding(in_w, k[2], s[2], d[2])
+    if pad_d > 0 or pad_h > 0 or pad_w > 0:
+        # F.pad takes (before, after) pairs starting from the LAST dimension, so the
+        # order must be width, height, depth. The previous depth/width/height order
+        # applied each amount to the wrong axis whenever the three amounts differed.
+        x = F.pad(
+            x,
+            [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2, pad_d // 2, pad_d - pad_d // 2],
+            value=value,
+        )
+    return x
+def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
+    """Resolve a padding spec into ``(padding, dynamic)``.
+    Non-string values pass through unchanged with ``dynamic=False``. For strings
+    (case-insensitive): ``"valid"`` gives 0; ``"same"`` gives the static symmetric padding
+    when ``is_static_pad`` allows it, otherwise ``(0, True)`` to request dynamic padding at
+    run time (which has runtime/GPU memory overhead); any other string falls back to
+    PyTorch style symmetric padding from ``get_padding``.
+    """
+    if not isinstance(padding, str):
+        return padding, False
+    mode = padding.lower()
+    if mode == "valid":
+        # 'VALID' padding, same as padding=0
+        return 0, False
+    if mode == "same" and not is_static_pad(kernel_size, **kwargs):
+        # TF compatible 'SAME' padding that cannot be expressed statically
+        return 0, True
+    # static 'SAME' case, or the default symmetric padding for any other string
+    return get_padding(kernel_size, **kwargs), False
+# From PyTorch internals
+def _ntuple(n):
+    """Return a converter that repeats a non-iterable value ``n`` times into a tuple.
+    Iterable inputs (including strings) are returned unchanged.
+    """
+    def parse(x):
+        already_iterable = isinstance(x, collections.abc.Iterable)
+        return x if already_iterable else tuple(repeat(x, n))
+    return parse
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+to_ntuple = _ntuple
+def make_divisible(v, divisor=8, min_value=None, round_limit=0.9):
+    """Round ``v`` to a nearby multiple of ``divisor``, no smaller than ``min_value`` (default: ``divisor``).
+    If the rounded value falls below ``round_limit * v``, one more ``divisor`` is added.
+    """
+    lower_bound = min_value or divisor
+    nearest_multiple = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, nearest_multiple)
+    # Make sure that round down does not go down by more than 10%.
+    if result < round_limit * v:
+        result += divisor
+    return result
+class Linear(nn.Linear):
+    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+    Wraps torch.nn.Linear to support AMP + torchscript usage by manually casting
+    weight & bias to input.dtype to work around an issue w/ torch.addmm in this use case.
+    """
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # Only the scripted path casts weight/bias to the input dtype; the eager
+        # path is the plain F.linear call.
+        if torch.jit.is_scripting():
+            bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None
+            return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias)
+        else:
+            return F.linear(input, self.weight, self.bias)
+class Mlp(nn.Module):
+    """Two-layer perceptron: linear, activation, dropout, linear, dropout.
+    ``hidden_features`` and ``out_features`` default to ``in_features``. ``drop`` may be a
+    single rate used for both dropout layers or an iterable giving one rate per layer.
+    """
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
+        super().__init__()
+        hidden_features = hidden_features or in_features
+        out_features = out_features or in_features
+        rates = to_2tuple(drop)
+        # Submodules are registered in the same order as they are applied.
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(rates[0])
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop2 = nn.Dropout(rates[1])
+    def forward(self, x):
+        for layer in (self.fc1, self.act, self.drop1, self.fc2, self.drop2):
+            x = layer(x)
+        return x
+def avg_pool3d_same(
+    x,
+    kernel_size: List[int],
+    stride: List[int],
+    padding: List[int] = (0, 0, 0),
+    ceil_mode: bool = False,
+    count_include_pad: bool = True,
+):
+    """3D average pooling with 'SAME' padding applied by ``pad_same`` beforehand.
+    ``padding`` is accepted but not used; the pooling call itself always pads by 0.
+    """
+    # FIXME how to deal with count_include_pad vs not for external padding?
+    padded = pad_same(x, kernel_size, stride)
+    no_padding = (0, 0, 0)
+    return F.avg_pool3d(padded, kernel_size, stride, no_padding, ceil_mode, count_include_pad)
+class AvgPool3dSame(nn.AvgPool3d):
+    """Tensorflow like 'SAME' wrapper for 3D average pooling
+    Kernel size and stride are expanded to 3-tuples because ``pad_same`` indexes three
+    entries of each. ``stride=None`` falls back to the kernel size. ``padding`` is
+    accepted but ignored: the layer is built with zero padding and the input is padded
+    dynamically in ``forward``.
+    """
+    def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
+        kernel_size = to_3tuple(kernel_size)
+        # to_3tuple(None) would produce (None, None, None), which pad_same cannot use.
+        stride = kernel_size if stride is None else to_3tuple(stride)
+        super().__init__(kernel_size, stride, (0, 0, 0), ceil_mode, count_include_pad)
+    def forward(self, x):
+        x = pad_same(x, self.kernel_size, self.stride)
+        return F.avg_pool3d(x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad)
+def max_pool3d_same(
+    x,
+    kernel_size: List[int],
+    stride: List[int],
+    padding: List[int] = (0, 0, 0),
+    dilation: List[int] = (1, 1, 1),
+    ceil_mode: bool = False,
+):
+    """3D max pooling with 'SAME' padding applied by ``pad_same`` beforehand.
+    The padded region is filled with negative infinity so it never wins the max.
+    ``padding`` is accepted but not used; the pooling call itself always pads by 0.
+    """
+    padded = pad_same(x, kernel_size, stride, value=-float("inf"))
+    no_padding = (0, 0, 0)
+    return F.max_pool3d(padded, kernel_size, stride, no_padding, dilation, ceil_mode)
+class MaxPool3dSame(nn.MaxPool3d):
+    """Tensorflow like 'SAME' wrapper for 3D max pooling
+    Kernel size, stride and dilation are expanded to 3-tuples because ``pad_same`` indexes
+    three entries of kernel size and stride. ``stride=None`` falls back to the kernel size.
+    ``padding`` is accepted but ignored: the layer is built with zero padding and the input
+    is padded with negative infinity in ``forward``.
+    """
+    def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False):
+        kernel_size = to_3tuple(kernel_size)
+        # to_3tuple(None) would produce (None, None, None), which pad_same cannot use.
+        stride = kernel_size if stride is None else to_3tuple(stride)
+        dilation = to_3tuple(dilation)
+        super().__init__(kernel_size, stride, (0, 0, 0), dilation, ceil_mode)
+    def forward(self, x):
+        x = pad_same(x, self.kernel_size, self.stride, value=-float("inf"))
+        return F.max_pool3d(x, self.kernel_size, self.stride, (0, 0, 0), self.dilation, self.ceil_mode)
+def create_pool3d(pool_type, kernel_size, stride=None, **kwargs):
+    """Create a 3D pooling layer, resolving the ``padding`` keyword first.
+    Args:
+        pool_type: ``"avg"`` or ``"max"``.
+        kernel_size: pooling window size.
+        stride: pooling stride; falls back to ``kernel_size`` when falsy.
+        **kwargs: ``padding`` (default ``""``) is resolved by ``get_padding_value``; the
+            remaining keywords are forwarded to the pooling layer.
+    Returns:
+        ``AvgPool3dSame``/``MaxPool3dSame`` when dynamic 'SAME' padding is required,
+        otherwise ``nn.AvgPool3d``/``nn.MaxPool3d`` with the resolved static padding.
+    Raises:
+        AssertionError: if ``pool_type`` is neither ``"avg"`` nor ``"max"``.
+    """
+    stride = stride or kernel_size
+    padding = kwargs.pop("padding", "")
+    padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs)
+    if pool_type not in ("avg", "max"):
+        # Same exception type as before, now naming the offending value.
+        raise AssertionError(f"Unsupported pool type {pool_type}")
+    if is_dynamic:
+        pool_cls = AvgPool3dSame if pool_type == "avg" else MaxPool3dSame
+        return pool_cls(kernel_size, stride=stride, **kwargs)
+    pool_cls = nn.AvgPool3d if pool_type == "avg" else nn.MaxPool3d
+    return pool_cls(kernel_size, stride=stride, padding=padding, **kwargs)
+def _float_to_int(x: float) -> int:
+    """
+    Symbolic tracing helper to substitute for inbuilt `int`.
+    Hint: Inbuilt `int` can't accept an argument of type `Proxy`
+    """
+    return int(x)
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.0))
+        tensor.add_(mean)
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    r"""Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

scripts/networks/nest_transformer_3D.py ADDED Viewed

	@@ -0,0 +1,489 @@

+#!/usr/bin/env python3
+# =========================================================================
+# Adapted from https://github.com/google-research/nested-transformer.
+# which has the following license...
+# https://github.com/pytorch/vision/blob/main/LICENSE
+#
+# BSD 3-Clause License
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+""" Nested Transformer (NesT) in PyTorch
+A PyTorch implementation of Aggregating Nested Transformers as described in:
+'Aggregating Nested Transformers'
+    - https://arxiv.org/abs/2105.12723
+The official Jax code is released and available at https://github.com/google-research/nested-transformer.
+The weights have been converted with convert/convert_nest_flax.py
+Acknowledgments:
+* The paper authors for sharing their research, code, and model weights
+* Ross Wightman's existing code off which I based this
+Copyright 2021 Alexander Soare
+"""
+import collections.abc
+import logging
+import math
+from functools import partial
+from typing import Callable, Sequence
+import torch
+import torch.nn.functional as F
+from torch import nn
+from .nest import DropPath, Mlp, _assert, create_conv3d, create_pool3d, to_ntuple, trunc_normal_
+from .patchEmbed3D import PatchEmbed3D
+_logger = logging.getLogger(__name__)
+class Attention(nn.Module):
+    """
+    This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
+     an extra "image block" dim
+    """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x):
+        """
+        x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
+        """
+        b, t, n, c = x.shape
+        # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head)
+        qkv = self.qkv(x).reshape(b, t, n, 3, self.num_heads, c // self.num_heads).permute(3, 0, 4, 1, 2, 5)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+        attn = (q @ k.transpose(-2, -1)) * self.scale  # (B, H, T, N, N)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).permute(0, 2, 3, 4, 1).reshape(b, t, n, c)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x  # (B, T, N, C)
+class TransformerLayer(nn.Module):
+    """
+    This is much like `.vision_transformer.Block` but:
+        - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
+        - Uses modified Attention layer that handles the "block" dimension
+    """
+    def __init__(
+        self,
+        dim,
+        num_heads,
+        mlp_ratio=4.0,
+        qkv_bias=False,
+        drop=0.0,
+        attn_drop=0.0,
+        drop_path=0.0,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+    def forward(self, x):
+        y = self.norm1(x)
+        x = x + self.drop_path(self.attn(y))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class ConvPool(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer, pad_type=""):
+        super().__init__()
+        self.conv = create_conv3d(in_channels, out_channels, kernel_size=3, padding=pad_type, bias=True)
+        self.norm = norm_layer(out_channels)
+        self.pool = create_pool3d("max", kernel_size=3, stride=2, padding=pad_type)
+    def forward(self, x):
+        """
+        x is expected to have shape (B, C, D, H, W)
+        """
+        _assert(x.shape[-3] % 2 == 0, "BlockAggregation requires even input spatial dims")
+        _assert(x.shape[-2] % 2 == 0, "BlockAggregation requires even input spatial dims")
+        _assert(x.shape[-1] % 2 == 0, "BlockAggregation requires even input spatial dims")
+        # print('In ConvPool x : {}'.format(x.shape))
+        x = self.conv(x)
+        # Layer norm done over channel dim only
+        x = self.norm(x.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)
+        x = self.pool(x)
+        return x  # (B, C, D//2, H//2, W//2)
+def blockify(x, block_size: int):
+    """image to blocks
+    Args:
+        x (Tensor): with shape (B, D, H, W, C)
+        block_size (int): edge length of a single square block in units of D, H, W
+    """
+    b, d, h, w, c = x.shape
+    _assert(d % block_size == 0, "`block_size` must divide input depth evenly")
+    _assert(h % block_size == 0, "`block_size` must divide input height evenly")
+    _assert(w % block_size == 0, "`block_size` must divide input width evenly")
+    grid_depth = d // block_size
+    grid_height = h // block_size
+    grid_width = w // block_size
+    x = x.reshape(b, grid_depth, block_size, grid_height, block_size, grid_width, block_size, c)
+    x = x.permute(0, 1, 3, 5, 2, 4, 6, 7).reshape(
+        b, grid_depth * grid_height * grid_width, -1, c
+    )  # shape [2, 512, 27, 128]
+    return x  # (B, T, N, C)
+# @register_notrace_function  # reason: int receives Proxy
+def deblockify(x, block_size: int):
+    """blocks to image
+    Args:
+        x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
+        block_size (int): edge length of a single square block in units of desired D, H, W
+    """
+    b, t, _, c = x.shape
+    grid_size = round(math.pow(t, 1 / 3))
+    depth = height = width = grid_size * block_size
+    x = x.reshape(b, grid_size, grid_size, grid_size, block_size, block_size, block_size, c)
+    x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).reshape(b, depth, height, width, c)
+    return x  # (B, D, H, W, C)
+class NestLevel(nn.Module):
+    """Single hierarchical level of a Nested Transformer"""
+    def __init__(
+        self,
+        num_blocks,
+        block_size,
+        seq_length,
+        num_heads,
+        depth,
+        embed_dim,
+        prev_embed_dim=None,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rates: Sequence[int] = (),
+        norm_layer=None,
+        act_layer=None,
+        pad_type="",
+    ):
+        super().__init__()
+        self.block_size = block_size
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_blocks, seq_length, embed_dim))
+        if prev_embed_dim is not None:
+            self.pool = ConvPool(prev_embed_dim, embed_dim, norm_layer=norm_layer, pad_type=pad_type)
+        else:
+            self.pool = nn.Identity()
+        # Transformer encoder
+        if len(drop_path_rates):
+            assert len(drop_path_rates) == depth, "Must provide as many drop path rates as there are transformer layers"
+        self.transformer_encoder = nn.Sequential(
+            *[
+                TransformerLayer(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=drop_path_rates[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+    def forward(self, x):
+        """
+        expects x as (B, C, D, H, W)
+        """
+        x = self.pool(x)
+        x = x.permute(0, 2, 3, 4, 1)  # (B, H', W', C), switch to channels last for transformer
+        x = blockify(x, self.block_size)  # (B, T, N, C')
+        x = x + self.pos_embed
+        x = self.transformer_encoder(x)  # (B, ,T, N, C')
+        x = deblockify(x, self.block_size)  # (B, D', H', W', C') [2, 24, 24, 24, 128]
+        # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage
+        return x.permute(0, 4, 1, 2, 3)  # (B, C, D', H', W')
+class NestTransformer3D(nn.Module):
+    """Nested Transformer (NesT)
+    A PyTorch impl of : `Aggregating Nested Transformers`
+        - https://arxiv.org/abs/2105.12723
+    """
+    def __init__(
+        self,
+        img_size=96,
+        in_chans=1,
+        patch_size=2,
+        num_levels=3,
+        embed_dims=(128, 256, 512),
+        num_heads=(4, 8, 16),
+        depths=(2, 2, 20),
+        num_classes=1000,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.5,
+        norm_layer=None,
+        act_layer=None,
+        pad_type="",
+        weight_init="",
+        global_pool="avg",
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            in_chans (int): number of input channels
+            patch_size (int): patch size
+            num_levels (int): number of block hierarchies (T_d in the paper)
+            embed_dims (int, tuple): embedding dimensions of each level
+            num_heads (int, tuple): number of attention heads for each level
+            depths (int, tuple): number of transformer layers for each level
+            num_classes (int): number of classes for classification head
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
+            qkv_bias (bool): enable bias for qkv if True
+            drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            norm_layer: (nn.Module): normalization layer for transformer layers
+            act_layer: (nn.Module): activation layer in MLP of transformer layers
+            pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME
+            weight_init: (str): weight init scheme
+            global_pool: (str): type of pooling operation to apply to final feature map
+        Notes:
+            - Default values follow NesT-B from the original Jax code.
+            - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
+            - For those following the paper, Table A1 may have errors!
+                - https://github.com/google-research/nested-transformer/issues/2
+        """
+        super().__init__()
+        for param_name in ["embed_dims", "num_heads", "depths"]:
+            param_value = locals()[param_name]
+            if isinstance(param_value, collections.abc.Sequence):
+                assert len(param_value) == num_levels, f"Require `len({param_name}) == num_levels`"
+        embed_dims = to_ntuple(num_levels)(embed_dims)
+        num_heads = to_ntuple(num_levels)(num_heads)
+        depths = to_ntuple(num_levels)(depths)
+        self.num_classes = num_classes
+        self.num_features = embed_dims[-1]
+        self.feature_info = []
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+        self.drop_rate = drop_rate
+        self.num_levels = num_levels
+        if isinstance(img_size, collections.abc.Sequence):
+            assert img_size[0] == img_size[1], "Model only handles square inputs"
+            img_size = img_size[0]
+        assert img_size % patch_size == 0, "`patch_size` must divide `img_size` evenly"
+        self.patch_size = patch_size
+        # Number of blocks at each level
+        self.num_blocks = (8 ** torch.arange(num_levels)).flip(0).tolist()
+        assert (img_size // patch_size) % round(
+            math.pow(self.num_blocks[0], 1 / 3)
+        ) == 0, "First level blocks don't fit evenly. Check `img_size`, `patch_size`, and `num_levels`"
+        # Block edge size in units of patches
+        # Hint: (img_size // patch_size) gives number of patches along edge of image. sqrt(self.num_blocks[0]) is the
+        #  number of blocks along edge of image
+        self.block_size = int((img_size // patch_size) // round(math.pow(self.num_blocks[0], 1 / 3)))
+        # Patch embedding
+        self.patch_embed = PatchEmbed3D(
+            img_size=[img_size, img_size, img_size],
+            patch_size=[patch_size, patch_size, patch_size],
+            in_chans=in_chans,
+            embed_dim=embed_dims[0],
+        )
+        self.num_patches = self.patch_embed.num_patches
+        self.seq_length = self.num_patches // self.num_blocks[0]
+        # Build up each hierarchical level
+        levels = []
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        prev_dim = None
+        curr_stride = 4
+        for i in range(len(self.num_blocks)):
+            dim = embed_dims[i]
+            levels.append(
+                NestLevel(
+                    self.num_blocks[i],
+                    self.block_size,
+                    self.seq_length,
+                    num_heads[i],
+                    depths[i],
+                    dim,
+                    prev_dim,
+                    mlp_ratio,
+                    qkv_bias,
+                    drop_rate,
+                    attn_drop_rate,
+                    dp_rates[i],
+                    norm_layer,
+                    act_layer,
+                    pad_type=pad_type,
+                )
+            )
+            self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f"levels.{i}")]
+            prev_dim = dim
+            curr_stride *= 2
+        self.levels = nn.ModuleList([levels[i] for i in range(num_levels)])
+        # Final normalization layer
+        self.norm = norm_layer(embed_dims[-1])
+        self.init_weights(weight_init)
+    def init_weights(self, mode=""):
+        assert mode in ("nlhb", "")
+        head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
+        for level in self.levels:
+            trunc_normal_(level.pos_embed, std=0.02, a=-2, b=2)
+        named_apply(partial(_init_nest_weights, head_bias=head_bias), self)
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {f"level.{i}.pos_embed" for i in range(len(self.levels))}
+    def get_classifier(self):
+        return self.head
+    def forward_features(self, x):
+        """x shape (B, C, D, H, W)"""
+        x = self.patch_embed(x)
+        hidden_states_out = [x]
+        for _, level in enumerate(self.levels):
+            x = level(x)
+            hidden_states_out.append(x)
+        # Layer norm done over channel dim only (to NDHWC and back)
+        x = self.norm(x.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)
+        return x, hidden_states_out
+    def forward(self, x):
+        """x shape (B, C, D, H, W)"""
+        x = self.forward_features(x)
+        if self.drop_rate > 0.0:
+            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        return x
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+def _init_nest_weights(module: nn.Module, name: str = "", head_bias: float = 0.0):
+    """NesT weight initialization
+    Can replicate Jax implementation. Otherwise follows vision_transformer.py
+    """
+    if isinstance(module, nn.Linear):
+        if name.startswith("head"):
+            trunc_normal_(module.weight, std=0.02, a=-2, b=2)
+            nn.init.constant_(module.bias, head_bias)
+        else:
+            trunc_normal_(module.weight, std=0.02, a=-2, b=2)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Conv2d):
+        trunc_normal_(module.weight, std=0.02, a=-2, b=2)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm2d)):
+        nn.init.zeros_(module.bias)
+        nn.init.ones_(module.weight)
+def resize_pos_embed(posemb, posemb_new):
+    """
+    Rescale the grid of position embeddings when loading from state_dict
+    Expected shape of position embeddings is (1, T, N, C), and considers only square images
+    """
+    _logger.info("Resized position embedding: %s to %s", posemb.shape, posemb_new.shape)
+    seq_length_old = posemb.shape[2]
+    num_blocks_new, seq_length_new = posemb_new.shape[1:3]
+    size_new = int(math.sqrt(num_blocks_new * seq_length_new))
+    # First change to (1, C, H, W)
+    posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2)
+    posemb = F.interpolate(posemb, size=[size_new, size_new], mode="bicubic", align_corners=False)
+    # Now change to new (1, T, N, C)
+    posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new)))
+    return posemb
+def checkpoint_filter_fn(state_dict, model):
+    """resize positional embeddings of pretrained weights"""
+    pos_embed_keys = [k for k in state_dict.keys() if k.startswith("pos_embed_")]
+    for k in pos_embed_keys:
+        if state_dict[k].shape != getattr(model, k).shape:
+            state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k))
+    return state_dict

scripts/networks/patchEmbed3D.py ADDED Viewed

	@@ -0,0 +1,190 @@

+#!/usr/bin/env python3
+# Copyright 2020 - 2021 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monai.utils import optional_import
+Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")
+class PatchEmbeddingBlock(nn.Module):
+    """
+    A patch embedding block, based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        img_size: Tuple[int, int, int],
+        patch_size: Tuple[int, int, int],
+        hidden_size: int,
+        num_heads: int,
+        pos_embed: str,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            in_channels: dimension of input channels.
+            img_size: dimension of input image.
+            patch_size: dimension of patch size.
+            hidden_size: dimension of hidden layer.
+            num_heads: number of attention heads.
+            pos_embed: position embedding layer type.
+            dropout_rate: faction of the input units to drop.
+        """
+        super().__init__()
+        if not (0 <= dropout_rate <= 1):
+            raise AssertionError("dropout_rate should be between 0 and 1.")
+        if hidden_size % num_heads != 0:
+            raise AssertionError("hidden size should be divisible by num_heads.")
+        for m, p in zip(img_size, patch_size):
+            if m < p:
+                raise AssertionError("patch_size should be smaller than img_size.")
+        if pos_embed not in ["conv", "perceptron"]:
+            raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.")
+        if pos_embed == "perceptron":
+            if img_size[0] % patch_size[0] != 0:
+                raise AssertionError("img_size should be divisible by patch_size for perceptron patch embedding.")
+        self.n_patches = (
+            (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2])
+        )
+        self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2]
+        self.pos_embed = pos_embed
+        self.patch_embeddings: Union[nn.Conv3d, nn.Sequential]
+        if self.pos_embed == "conv":
+            self.patch_embeddings = nn.Conv3d(
+                in_channels=in_channels, out_channels=hidden_size, kernel_size=patch_size, stride=patch_size
+            )
+        elif self.pos_embed == "perceptron":
+            self.patch_embeddings = nn.Sequential(
+                Rearrange(
+                    "b c (h p1) (w p2) (d p3)-> b (h w d) (p1 p2 p3 c)",
+                    p1=patch_size[0],
+                    p2=patch_size[1],
+                    p3=patch_size[2],
+                ),
+                nn.Linear(self.patch_dim, hidden_size),
+            )
+        self.position_embeddings = nn.Parameter(torch.zeros(1, self.n_patches, hidden_size))
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
+        self.dropout = nn.Dropout(dropout_rate)
+        self.trunc_normal_(self.position_embeddings, mean=0.0, std=0.02, a=-2.0, b=2.0)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            self.trunc_normal_(m.weight, mean=0.0, std=0.02, a=-2.0, b=2.0)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    def trunc_normal_(self, tensor, mean, std, a, b):
+        # From PyTorch official master until it's in a few official releases - RW
+        # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+        def norm_cdf(x):
+            return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+        with torch.no_grad():
+            l = norm_cdf((a - mean) / std)
+            u = norm_cdf((b - mean) / std)
+            tensor.uniform_(2 * l - 1, 2 * u - 1)
+            tensor.erfinv_()
+            tensor.mul_(std * math.sqrt(2.0))
+            tensor.add_(mean)
+            tensor.clamp_(min=a, max=b)
+            return tensor
+    def forward(self, x):
+        if self.pos_embed == "conv":
+            x = self.patch_embeddings(x)
+            x = x.flatten(2)
+            x = x.transpose(-1, -2)
+        elif self.pos_embed == "perceptron":
+            x = self.patch_embeddings(x)
+        embeddings = x + self.position_embeddings
+        embeddings = self.dropout(embeddings)
+        return embeddings
+class PatchEmbed3D(nn.Module):
+    """Video to Patch Embedding.
+    Args:
+        patch_size (int): Patch token size. Default: (2,4,4).
+        in_chans (int): Number of input video channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+    def __init__(
+        self,
+        img_size: Sequence[int] = (96, 96, 96),
+        patch_size=(4, 4, 4),
+        in_chans: int = 1,
+        embed_dim: int = 96,
+        norm_layer=None,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1], img_size[2] // patch_size[2])
+        self.num_patches = self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
+        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, d, h, w = x.size()
+        if w % self.patch_size[2] != 0:
+            x = F.pad(x, (0, self.patch_size[2] - w % self.patch_size[2]))
+        if h % self.patch_size[1] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[1] - h % self.patch_size[1]))
+        if d % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - d % self.patch_size[0]))
+        x = self.proj(x)  # B C D Wh Ww
+        if self.norm is not None:
+            d, wh, ww = x.size(2), x.size(3), x.size(4)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, d, wh, ww)
+            # pdb.set_trace()
+        return x

scripts/networks/unest.py ADDED Viewed

	@@ -0,0 +1,274 @@

+#!/usr/bin/env python3
+"""
+The 3D NEST transformer based segmentation model
+MASI Lab, Vanderbilt University
+Authors: Xin Yu, Yinchi Zhou, Yucheng Tang, Bennett Landman
+The NEST code is partly from
+Nested Hierarchical Transformer: Towards Accurate, Data-Efficient and
+Interpretable Visual Understanding
+https://arxiv.org/pdf/2105.12723.pdf
+"""
+# limitations under the License.
+from typing import Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+from monai.networks.blocks import Convolution
+from monai.networks.blocks.dynunet_block import UnetOutBlock
+# from scripts.networks.swin_transformer_3d import SwinTransformer3D
+from scripts.networks.nest_transformer_3D import NestTransformer3D
+from scripts.networks.unest_block import UNesTBlock, UNesTConvBlock, UNestUpBlock
+# from monai.networks.blocks.unetr_block import UnetstrBasicBlock, UnetrPrUpBlock, UnetResBlock
+class UNesT(nn.Module):
+    """
+    UNesT model implementation
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        img_size: Sequence[int] = (96, 96, 96),
+        feature_size: int = 16,
+        patch_size: int = 2,
+        depths: Sequence[int] = (2, 2, 2, 2),
+        num_heads: Sequence[int] = (3, 6, 12, 24),
+        window_size: Sequence[int] = (7, 7, 7),
+        norm_name: Union[Tuple, str] = "instance",
+        conv_block: bool = False,
+        res_block: bool = True,
+        # featResBlock: bool = False,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            in_channels: dimension of input channels.
+            out_channels: dimension of output channels.
+            img_size: dimension of input image.
+            feature_size: dimension of network feature size.
+            hidden_size: dimension of hidden layer.
+            mlp_dim: dimension of feedforward layer.
+            num_heads: number of attention heads.
+            pos_embed: position embedding layer type.
+            norm_name: feature normalization type and arguments.
+            conv_block: bool argument to determine if convolutional block is used.
+            res_block: bool argument to determine if residual block is used.
+            dropout_rate: faction of the input units to drop.
+        """
+        super().__init__()
+        if not (0 <= dropout_rate <= 1):
+            raise AssertionError("dropout_rate should be between 0 and 1.")
+        self.embed_dim = [128, 256, 512]
+        self.nestViT = NestTransformer3D(
+            img_size=96,
+            in_chans=1,
+            patch_size=4,
+            num_levels=3,
+            embed_dims=(128, 256, 512),
+            num_heads=(4, 8, 16),
+            depths=(2, 2, 8),
+            num_classes=1000,
+            mlp_ratio=4.0,
+            qkv_bias=True,
+            drop_rate=0.0,
+            attn_drop_rate=0.0,
+            drop_path_rate=0.5,
+            norm_layer=None,
+            act_layer=None,
+            pad_type="",
+            weight_init="",
+            global_pool="avg",
+        )
+        self.encoder1 = UNesTConvBlock(
+            spatial_dims=3,
+            in_channels=1,
+            out_channels=feature_size * 2,
+            kernel_size=3,
+            stride=1,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.encoder2 = UNestUpBlock(
+            spatial_dims=3,
+            in_channels=self.embed_dim[0],
+            out_channels=feature_size * 4,
+            num_layer=1,
+            kernel_size=3,
+            stride=1,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            conv_block=False,
+            res_block=False,
+        )
+        self.encoder3 = UNesTConvBlock(
+            spatial_dims=3,
+            in_channels=self.embed_dim[0],
+            out_channels=8 * feature_size,
+            kernel_size=3,
+            stride=1,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.encoder4 = UNesTConvBlock(
+            spatial_dims=3,
+            in_channels=self.embed_dim[1],
+            out_channels=16 * feature_size,
+            kernel_size=3,
+            stride=1,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.decoder5 = UNesTBlock(
+            spatial_dims=3,
+            in_channels=2 * self.embed_dim[2],
+            out_channels=feature_size * 32,
+            stride=1,
+            kernel_size=3,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.decoder4 = UNesTBlock(
+            spatial_dims=3,
+            in_channels=self.embed_dim[2],
+            out_channels=feature_size * 16,
+            stride=1,
+            kernel_size=3,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.decoder3 = UNesTBlock(
+            spatial_dims=3,
+            in_channels=feature_size * 16,
+            out_channels=feature_size * 8,
+            stride=1,
+            kernel_size=3,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.decoder2 = UNesTBlock(
+            spatial_dims=3,
+            in_channels=feature_size * 8,
+            out_channels=feature_size * 4,
+            stride=1,
+            kernel_size=3,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.decoder1 = UNesTBlock(
+            spatial_dims=3,
+            in_channels=feature_size * 4,
+            out_channels=feature_size * 2,
+            stride=1,
+            kernel_size=3,
+            upsample_kernel_size=2,
+            norm_name=norm_name,
+            res_block=res_block,
+        )
+        self.encoder10 = Convolution(
+            dimensions=3,
+            in_channels=32 * feature_size,
+            out_channels=64 * feature_size,
+            strides=2,
+            adn_ordering="ADN",
+            dropout=0.0,
+        )
+        self.out = UnetOutBlock(spatial_dims=3, in_channels=feature_size * 2, out_channels=out_channels)  # type: ignore
+    def proj_feat(self, x, hidden_size, feat_size):
+        x = x.view(x.size(0), feat_size[0], feat_size[1], feat_size[2], hidden_size)
+        x = x.permute(0, 4, 1, 2, 3).contiguous()
+        return x
+    def load_from(self, weights):
+        with torch.no_grad():
+            # copy weights from patch embedding
+            for i in weights["state_dict"]:
+                print(i)
+            self.vit.patch_embedding.position_embeddings.copy_(
+                weights["state_dict"]["module.transformer.patch_embedding.position_embeddings_3d"]
+            )
+            self.vit.patch_embedding.cls_token.copy_(
+                weights["state_dict"]["module.transformer.patch_embedding.cls_token"]
+            )
+            self.vit.patch_embedding.patch_embeddings[1].weight.copy_(
+                weights["state_dict"]["module.transformer.patch_embedding.patch_embeddings_3d.1.weight"]
+            )
+            self.vit.patch_embedding.patch_embeddings[1].bias.copy_(
+                weights["state_dict"]["module.transformer.patch_embedding.patch_embeddings_3d.1.bias"]
+            )
+            # copy weights from  encoding blocks (default: num of blocks: 12)
+            for bname, block in self.vit.blocks.named_children():
+                print(block)
+                block.loadFrom(weights, n_block=bname)
+            # last norm layer of transformer
+            self.vit.norm.weight.copy_(weights["state_dict"]["module.transformer.norm.weight"])
+            self.vit.norm.bias.copy_(weights["state_dict"]["module.transformer.norm.bias"])
+    def forward(self, x_in):
+        x, hidden_states_out = self.nestViT(x_in)
+        enc0 = self.encoder1(x_in)  # 2, 32, 96, 96, 96
+        x1 = hidden_states_out[0]  # 2, 128, 24, 24, 24
+        enc1 = self.encoder2(x1)  # 2, 64, 48, 48, 48
+        x2 = hidden_states_out[1]  # 2, 128, 24, 24, 24
+        enc2 = self.encoder3(x2)  # 2, 128, 24, 24, 24
+        x3 = hidden_states_out[2]  # 2, 256, 12, 12, 12
+        enc3 = self.encoder4(x3)  # 2, 256, 12, 12, 12
+        x4 = hidden_states_out[3]
+        enc4 = x4  # 2, 512, 6, 6, 6
+        dec4 = x  # 2, 512, 6, 6, 6
+        dec4 = self.encoder10(dec4)  # 2, 1024, 3, 3, 3
+        dec3 = self.decoder5(dec4, enc4)  # 2, 512, 6, 6, 6
+        dec2 = self.decoder4(dec3, enc3)  # 2, 256, 12, 12, 12
+        dec1 = self.decoder3(dec2, enc2)  # 2, 128, 24, 24, 24
+        dec0 = self.decoder2(dec1, enc1)  # 2, 64, 48, 48, 48
+        out = self.decoder1(dec0, enc0)  # 2, 32, 96, 96, 96
+        logits = self.out(out)
+        return logits

scripts/networks/unest_block.py ADDED Viewed

	@@ -0,0 +1,245 @@

+#!/usr/bin/env python3
+from typing import Sequence, Tuple, Union
+import torch
+import torch.nn as nn
+from monai.networks.blocks.dynunet_block import UnetBasicBlock, UnetResBlock, get_conv_layer
+class UNesTBlock(nn.Module):
+    """ """
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,  # type: ignore
+        kernel_size: Union[Sequence[int], int],
+        stride: Union[Sequence[int], int],
+        upsample_kernel_size: Union[Sequence[int], int],
+        norm_name: Union[Tuple, str],
+        res_block: bool = False,
+    ) -> None:
+        """
+        Args:
+            spatial_dims: number of spatial dimensions.
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            kernel_size: convolution kernel size.
+            stride: convolution stride.
+            upsample_kernel_size: convolution kernel size for transposed convolution layers.
+            norm_name: feature normalization type and arguments.
+            res_block: bool argument to determine if residual block is used.
+        """
+        super(UNesTBlock, self).__init__()
+        upsample_stride = upsample_kernel_size
+        self.transp_conv = get_conv_layer(
+            spatial_dims,
+            in_channels,
+            out_channels,
+            kernel_size=upsample_kernel_size,
+            stride=upsample_stride,
+            conv_only=True,
+            is_transposed=True,
+        )
+        if res_block:
+            self.conv_block = UnetResBlock(
+                spatial_dims,
+                out_channels + out_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=1,
+                norm_name=norm_name,
+            )
+        else:
+            self.conv_block = UnetBasicBlock(  # type: ignore
+                spatial_dims,
+                out_channels + out_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=1,
+                norm_name=norm_name,
+            )
+    def forward(self, inp, skip):
+        # number of channels for skip should equals to out_channels
+        out = self.transp_conv(inp)
+        # print(out.shape)
+        # print(skip.shape)
+        out = torch.cat((out, skip), dim=1)
+        out = self.conv_block(out)
+        return out
+class UNestUpBlock(nn.Module):
+    """ """
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,
+        num_layer: int,
+        kernel_size: Union[Sequence[int], int],
+        stride: Union[Sequence[int], int],
+        upsample_kernel_size: Union[Sequence[int], int],
+        norm_name: Union[Tuple, str],
+        conv_block: bool = False,
+        res_block: bool = False,
+    ) -> None:
+        """
+        Args:
+            spatial_dims: number of spatial dimensions.
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            num_layer: number of upsampling blocks.
+            kernel_size: convolution kernel size.
+            stride: convolution stride.
+            upsample_kernel_size: convolution kernel size for transposed convolution layers.
+            norm_name: feature normalization type and arguments.
+            conv_block: bool argument to determine if convolutional block is used.
+            res_block: bool argument to determine if residual block is used.
+        """
+        super().__init__()
+        upsample_stride = upsample_kernel_size
+        self.transp_conv_init = get_conv_layer(
+            spatial_dims,
+            in_channels,
+            out_channels,
+            kernel_size=upsample_kernel_size,
+            stride=upsample_stride,
+            conv_only=True,
+            is_transposed=True,
+        )
+        if conv_block:
+            if res_block:
+                self.blocks = nn.ModuleList(
+                    [
+                        nn.Sequential(
+                            get_conv_layer(
+                                spatial_dims,
+                                out_channels,
+                                out_channels,
+                                kernel_size=upsample_kernel_size,
+                                stride=upsample_stride,
+                                conv_only=True,
+                                is_transposed=True,
+                            ),
+                            UnetResBlock(
+                                spatial_dims=3,
+                                in_channels=out_channels,
+                                out_channels=out_channels,
+                                kernel_size=kernel_size,
+                                stride=stride,
+                                norm_name=norm_name,
+                            ),
+                        )
+                        for i in range(num_layer)
+                    ]
+                )
+            else:
+                self.blocks = nn.ModuleList(
+                    [
+                        nn.Sequential(
+                            get_conv_layer(
+                                spatial_dims,
+                                out_channels,
+                                out_channels,
+                                kernel_size=upsample_kernel_size,
+                                stride=upsample_stride,
+                                conv_only=True,
+                                is_transposed=True,
+                            ),
+                            UnetBasicBlock(
+                                spatial_dims=3,
+                                in_channels=out_channels,
+                                out_channels=out_channels,
+                                kernel_size=kernel_size,
+                                stride=stride,
+                                norm_name=norm_name,
+                            ),
+                        )
+                        for i in range(num_layer)
+                    ]
+                )
+        else:
+            self.blocks = nn.ModuleList(
+                [
+                    get_conv_layer(
+                        spatial_dims,
+                        out_channels,
+                        out_channels,
+                        kernel_size=1,
+                        stride=1,
+                        conv_only=True,
+                        is_transposed=True,
+                    )
+                    for i in range(num_layer)
+                ]
+            )
+    def forward(self, x):
+        x = self.transp_conv_init(x)
+        for blk in self.blocks:
+            x = blk(x)
+        return x
+class UNesTConvBlock(nn.Module):
+    """
+    UNesT block with skip connections
+    """
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[Sequence[int], int],
+        stride: Union[Sequence[int], int],
+        norm_name: Union[Tuple, str],
+        res_block: bool = False,
+    ) -> None:
+        """
+        Args:
+            spatial_dims: number of spatial dimensions.
+            in_channels: number of input channels.
+            out_channels: number of output channels.
+            kernel_size: convolution kernel size.
+            stride: convolution stride.
+            norm_name: feature normalization type and arguments.
+            res_block: bool argument to determine if residual block is used.
+        """
+        super().__init__()
+        if res_block:
+            self.layer = UnetResBlock(
+                spatial_dims=spatial_dims,
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                norm_name=norm_name,
+            )
+        else:
+            self.layer = UnetBasicBlock(  # type: ignore
+                spatial_dims=spatial_dims,
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                norm_name=norm_name,
+            )
+    def forward(self, inp):
+        out = self.layer(inp)
+        return out