Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) Meta Platforms, Inc. and affiliates. | |
# All rights reserved. | |
# | |
# This source code is licensed under the license found in the | |
# LICENSE file in the root directory of this source tree. | |
# Copyright (c) 2019 Western Digital Corporation or its affiliates. | |
from typing import List, Tuple | |
import torch | |
import torch.nn.functional as F | |
from mmcv.cnn import ConvModule | |
from mmengine.model import BaseModule | |
from torch import Tensor | |
from mmdet.registry import MODELS | |
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig | |
class DetectionBlock(BaseModule): | |
"""Detection block in YOLO neck. | |
Let out_channels = n, the DetectionBlock contains: | |
Six ConvLayers, 1 Conv2D Layer and 1 YoloLayer. | |
The first 6 ConvLayers are formed the following way: | |
1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n. | |
The Conv2D layer is 1x1x255. | |
Some block will have branch after the fifth ConvLayer. | |
The input channel is arbitrary (in_channels) | |
Args: | |
in_channels (int): The number of input channels. | |
out_channels (int): The number of output channels. | |
conv_cfg (dict): Config dict for convolution layer. Default: None. | |
norm_cfg (dict): Dictionary to construct and config norm layer. | |
Default: dict(type='BN', requires_grad=True) | |
act_cfg (dict): Config dict for activation layer. | |
Default: dict(type='LeakyReLU', negative_slope=0.1). | |
init_cfg (dict or list[dict], optional): Initialization config dict. | |
Default: None | |
""" | |
def __init__(self, | |
in_channels: int, | |
out_channels: int, | |
conv_cfg: OptConfigType = None, | |
norm_cfg: ConfigType = dict(type='BN', requires_grad=True), | |
act_cfg: ConfigType = dict( | |
type='LeakyReLU', negative_slope=0.1), | |
init_cfg: OptMultiConfig = None) -> None: | |
super(DetectionBlock, self).__init__(init_cfg) | |
double_out_channels = out_channels * 2 | |
# shortcut | |
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) | |
self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg) | |
self.conv2 = ConvModule( | |
out_channels, double_out_channels, 3, padding=1, **cfg) | |
self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg) | |
self.conv4 = ConvModule( | |
out_channels, double_out_channels, 3, padding=1, **cfg) | |
self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg) | |
def forward(self, x: Tensor) -> Tensor: | |
tmp = self.conv1(x) | |
tmp = self.conv2(tmp) | |
tmp = self.conv3(tmp) | |
tmp = self.conv4(tmp) | |
out = self.conv5(tmp) | |
return out | |
class YOLOV3Neck(BaseModule): | |
"""The neck of YOLOV3. | |
It can be treated as a simplified version of FPN. It | |
will take the result from Darknet backbone and do some upsampling and | |
concatenation. It will finally output the detection result. | |
Note: | |
The input feats should be from top to bottom. | |
i.e., from high-lvl to low-lvl | |
But YOLOV3Neck will process them in reversed order. | |
i.e., from bottom (high-lvl) to top (low-lvl) | |
Args: | |
num_scales (int): The number of scales / stages. | |
in_channels (List[int]): The number of input channels per scale. | |
out_channels (List[int]): The number of output channels per scale. | |
conv_cfg (dict, optional): Config dict for convolution layer. | |
Default: None. | |
norm_cfg (dict, optional): Dictionary to construct and config norm | |
layer. Default: dict(type='BN', requires_grad=True) | |
act_cfg (dict, optional): Config dict for activation layer. | |
Default: dict(type='LeakyReLU', negative_slope=0.1). | |
init_cfg (dict or list[dict], optional): Initialization config dict. | |
Default: None | |
""" | |
def __init__(self, | |
num_scales: int, | |
in_channels: List[int], | |
out_channels: List[int], | |
conv_cfg: OptConfigType = None, | |
norm_cfg: ConfigType = dict(type='BN', requires_grad=True), | |
act_cfg: ConfigType = dict( | |
type='LeakyReLU', negative_slope=0.1), | |
init_cfg: OptMultiConfig = None) -> None: | |
super(YOLOV3Neck, self).__init__(init_cfg) | |
assert (num_scales == len(in_channels) == len(out_channels)) | |
self.num_scales = num_scales | |
self.in_channels = in_channels | |
self.out_channels = out_channels | |
# shortcut | |
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) | |
# To support arbitrary scales, the code looks awful, but it works. | |
# Better solution is welcomed. | |
self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg) | |
for i in range(1, self.num_scales): | |
in_c, out_c = self.in_channels[i], self.out_channels[i] | |
inter_c = out_channels[i - 1] | |
self.add_module(f'conv{i}', ConvModule(inter_c, out_c, 1, **cfg)) | |
# in_c + out_c : High-lvl feats will be cat with low-lvl feats | |
self.add_module(f'detect{i+1}', | |
DetectionBlock(in_c + out_c, out_c, **cfg)) | |
def forward(self, feats=Tuple[Tensor]) -> Tuple[Tensor]: | |
assert len(feats) == self.num_scales | |
# processed from bottom (high-lvl) to top (low-lvl) | |
outs = [] | |
out = self.detect1(feats[-1]) | |
outs.append(out) | |
for i, x in enumerate(reversed(feats[:-1])): | |
conv = getattr(self, f'conv{i+1}') | |
tmp = conv(out) | |
# Cat with low-lvl feats | |
tmp = F.interpolate(tmp, scale_factor=2) | |
tmp = torch.cat((tmp, x), 1) | |
detect = getattr(self, f'detect{i+2}') | |
out = detect(tmp) | |
outs.append(out) | |
return tuple(outs) | |