Spaces:
Sleeping
Sleeping
Create models/modnet.py
Browse files- src/models/modnet.py +255 -0
src/models/modnet.py
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
from .backbones import SUPPORTED_BACKBONES
|
6 |
+
|
7 |
+
|
8 |
+
#------------------------------------------------------------------------------
|
9 |
+
# MODNet Basic Modules
|
10 |
+
#------------------------------------------------------------------------------
|
11 |
+
|
12 |
+
class IBNorm(nn.Module):
    """ Combine Instance Norm and Batch Norm into One Layer

    The first ``in_channels // 2`` channels (dim 1) of the input are
    normalized by an affine ``nn.BatchNorm2d``; the remaining channels are
    normalized by a non-affine ``nn.InstanceNorm2d``. The two halves are
    concatenated back along dim 1.

    Args:
        in_channels: number of channels the layer is built for.
    """

    def __init__(self, in_channels):
        super(IBNorm, self).__init__()
        # Floor division: when in_channels is odd the instance-norm half
        # receives the extra channel.
        self.bnorm_channels = in_channels // 2
        self.inorm_channels = in_channels - self.bnorm_channels

        self.bnorm = nn.BatchNorm2d(self.bnorm_channels, affine=True)
        self.inorm = nn.InstanceNorm2d(self.inorm_channels, affine=False)

    def forward(self, x):
        """ Normalize both channel groups of ``x`` and re-join them on dim 1.
        """
        # Slicing on dim 1 yields non-contiguous views; .contiguous() hands
        # each norm layer a densely laid-out tensor.
        bn_x = self.bnorm(x[:, :self.bnorm_channels, ...].contiguous())
        in_x = self.inorm(x[:, self.bnorm_channels:, ...].contiguous())

        return torch.cat((bn_x, in_x), 1)
|
30 |
+
|
31 |
+
|
32 |
+
class Conv2dIBNormRelu(nn.Module):
    """ Convolution + IBNorm + ReLu

    Builds one ``nn.Sequential`` made of an ``nn.Conv2d``, optionally
    followed by ``IBNorm(out_channels)`` and optionally followed by an
    in-place ``nn.ReLU``.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, dilation,
        groups, bias: passed through unchanged to ``nn.Conv2d``.
        with_ibn: when true, add the IBNorm stage after the convolution.
        with_relu: when true, add the ReLU stage last.
    """

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, dilation=1, groups=1, bias=True,
                 with_ibn=True, with_relu=True):
        super(Conv2dIBNormRelu, self).__init__()

        conv = nn.Conv2d(
            in_channels, out_channels, kernel_size,
            stride=stride, padding=padding, dilation=dilation,
            groups=groups, bias=bias,
        )

        # Stage order is fixed: conv -> (norm) -> (activation).
        stages = [conv]
        if with_ibn:
            stages += [IBNorm(out_channels)]
        if with_relu:
            stages += [nn.ReLU(inplace=True)]

        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """ Run ``x`` through the configured stages in order.
        """
        out = self.layers(x)
        return out
|
56 |
+
|
57 |
+
|
58 |
+
class SEBlock(nn.Module):
    """ SE Block Proposed in https://arxiv.org/pdf/1709.01507.pdf

    The input is globally average-pooled, passed through a bias-free
    Linear -> ReLU -> Linear -> Sigmoid stack, and the resulting gate is
    multiplied onto the input.

    Args:
        in_channels: channel count of the input (width of the pooled vector).
        out_channels: width of the gate produced by the second Linear. It
            has to be expandable to the input, i.e. equal to the input's
            channel count, or 1 for a single gate shared by all channels.
        reduction: divisor applied to ``in_channels`` to get the hidden width.
    """

    def __init__(self, in_channels, out_channels, reduction=1):
        super(SEBlock, self).__init__()
        hidden_channels = int(in_channels // reduction)

        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, out_channels, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        """ Return ``x`` scaled by its gate; the output has the shape of ``x``.
        """
        b, c, _, _ = x.size()
        w = self.pool(x).view(b, c)
        w = self.fc(w)
        # Reshape with the gate's own width rather than the input channel
        # count, so a gate narrower than the input (out_channels == 1) is
        # broadcast by expand_as instead of failing inside view().
        w = w.view(b, w.size(1), 1, 1)

        return x * w.expand_as(x)
|
78 |
+
|
79 |
+
|
80 |
+
#------------------------------------------------------------------------------
|
81 |
+
# MODNet Branches
|
82 |
+
#------------------------------------------------------------------------------
|
83 |
+
|
84 |
+
class LRBranch(nn.Module):
    """ Low Resolution Branch of MODNet

    Args:
        backbone: encoder object. This class reads ``backbone.enc_channels``
            at indices 2, 3 and 4 and calls ``backbone.forward(img)``, from
            whose result it uses items 0, 1 and 4.
    """

    def __init__(self, backbone):
        super(LRBranch, self).__init__()

        ch = backbone.enc_channels

        self.backbone = backbone
        self.se_block = SEBlock(ch[4], ch[4], reduction=4)
        self.conv_lr16x = Conv2dIBNormRelu(ch[4], ch[3], 5, stride=1, padding=2)
        self.conv_lr8x = Conv2dIBNormRelu(ch[3], ch[2], 5, stride=1, padding=2)
        # Single-channel head without norm/activation; sigmoid is applied in forward().
        self.conv_lr = Conv2dIBNormRelu(
            ch[2], 1, kernel_size=3, stride=2, padding=1,
            with_ibn=False, with_relu=False,
        )

    def forward(self, img, inference):
        """ Run the encoder and the low-resolution decoder.

        Args:
            img: input passed to the backbone.
            inference: when truthy, the semantic head is skipped.

        Returns:
            ``(pred_semantic, lr8x, [enc2x, enc4x])`` where ``pred_semantic``
            is ``None`` if ``inference`` is truthy.
        """
        feats = self.backbone.forward(img)
        enc2x = feats[0]
        enc4x = feats[1]

        gated = self.se_block(feats[4])

        # Two rounds of (2x bilinear upsample -> 5x5 conv block).
        lr16x = self.conv_lr16x(
            F.interpolate(gated, scale_factor=2, mode='bilinear', align_corners=False))
        lr8x = self.conv_lr8x(
            F.interpolate(lr16x, scale_factor=2, mode='bilinear', align_corners=False))

        if inference:
            pred_semantic = None
        else:
            pred_semantic = torch.sigmoid(self.conv_lr(lr8x))

        return pred_semantic, lr8x, [enc2x, enc4x]
|
115 |
+
|
116 |
+
|
117 |
+
class HRBranch(nn.Module):
    """ High Resolution Branch of MODNet

    Args:
        hr_channels: base channel width used by the layers of this branch.
        enc_channels: indexable of encoder channel counts; entries 0 and 1
            are read here.

    The ``+ 3`` in several input widths accounts for the image (or a
    downscaled copy of it) being concatenated on dim 1, so ``img`` is
    expected to carry 3 channels.
    """

    def __init__(self, hr_channels, enc_channels):
        super(HRBranch, self).__init__()

        def conv1x1(c_in, c_out):
            return Conv2dIBNormRelu(c_in, c_out, 1, stride=1, padding=0)

        def conv3x3(c_in, c_out):
            return Conv2dIBNormRelu(c_in, c_out, 3, stride=1, padding=1)

        base = hr_channels
        wide = 2 * hr_channels

        self.tohr_enc2x = conv1x1(enc_channels[0], base)
        self.conv_enc2x = Conv2dIBNormRelu(base + 3, base, 3, stride=2, padding=1)

        self.tohr_enc4x = conv1x1(enc_channels[1], base)
        self.conv_enc4x = conv3x3(wide, wide)

        self.conv_hr4x = nn.Sequential(
            conv3x3(3 * hr_channels + 3, wide),
            conv3x3(wide, wide),
            conv3x3(wide, base),
        )

        self.conv_hr2x = nn.Sequential(
            conv3x3(wide, wide),
            conv3x3(wide, base),
            conv3x3(base, base),
            conv3x3(base, base),
        )

        self.conv_hr = nn.Sequential(
            conv3x3(base + 3, base),
            # Single-channel head without norm/activation; sigmoid is applied in forward().
            Conv2dIBNormRelu(base, 1, kernel_size=1, stride=1, padding=0,
                             with_ibn=False, with_relu=False),
        )

    def forward(self, img, enc2x, enc4x, lr8x, inference):
        """ Compute the high-resolution features and, optionally, the detail map.

        Args:
            img: input image tensor.
            enc2x, enc4x: encoder features fed to ``tohr_enc2x`` / ``tohr_enc4x``.
            lr8x: feature tensor that is upsampled 2x and merged in.
            inference: when truthy, the detail head is skipped.

        Returns:
            ``(pred_detail, hr2x)`` where ``pred_detail`` is ``None`` if
            ``inference`` is truthy.
        """
        def rescale(t, factor):
            return F.interpolate(t, scale_factor=factor, mode='bilinear', align_corners=False)

        img2x = rescale(img, 1/2)
        img4x = rescale(img, 1/4)

        # enc2x is rebound to its projected version; the projection is
        # reused further down when building hr2x.
        enc2x = self.tohr_enc2x(enc2x)
        hr4x = self.conv_enc2x(torch.cat((img2x, enc2x), dim=1))

        enc4x = self.tohr_enc4x(enc4x)
        hr4x = self.conv_enc4x(torch.cat((hr4x, enc4x), dim=1))

        lr4x = rescale(lr8x, 2)
        hr4x = self.conv_hr4x(torch.cat((hr4x, lr4x, img4x), dim=1))

        hr2x = self.conv_hr2x(torch.cat((rescale(hr4x, 2), enc2x), dim=1))

        if inference:
            pred_detail = None
        else:
            full_res = torch.cat((rescale(hr2x, 2), img), dim=1)
            pred_detail = torch.sigmoid(self.conv_hr(full_res))

        return pred_detail, hr2x
|
171 |
+
|
172 |
+
|
173 |
+
class FusionBranch(nn.Module):
    """ Fusion Branch of MODNet

    Args:
        hr_channels: base channel width used by the layers of this branch.
        enc_channels: indexable of encoder channel counts; entry 2 is read
            here as the input width of ``conv_lr4x``.

    The ``+ 3`` in ``conv_f`` accounts for ``img`` being concatenated on
    dim 1, so ``img`` is expected to carry 3 channels.
    """

    def __init__(self, hr_channels, enc_channels):
        super(FusionBranch, self).__init__()

        half = int(hr_channels / 2)

        self.conv_lr4x = Conv2dIBNormRelu(enc_channels[2], hr_channels, 5, stride=1, padding=2)

        self.conv_f2x = Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1)
        self.conv_f = nn.Sequential(
            Conv2dIBNormRelu(hr_channels + 3, half, 3, stride=1, padding=1),
            # Single-channel head without norm/activation; sigmoid is applied in forward().
            Conv2dIBNormRelu(half, 1, 1, stride=1, padding=0,
                             with_ibn=False, with_relu=False),
        )

    def forward(self, img, lr8x, hr2x):
        """ Merge ``lr8x`` and ``hr2x`` with the image and return the sigmoid output.
        """
        def up2x(t):
            return F.interpolate(t, scale_factor=2, mode='bilinear', align_corners=False)

        lr4x = self.conv_lr4x(up2x(lr8x))
        lr2x = up2x(lr4x)

        f2x = self.conv_f2x(torch.cat((lr2x, hr2x), dim=1))
        fused = self.conv_f(torch.cat((up2x(f2x), img), dim=1))

        pred_matte = torch.sigmoid(fused)
        return pred_matte
|
198 |
+
|
199 |
+
|
200 |
+
#------------------------------------------------------------------------------
|
201 |
+
# MODNet
|
202 |
+
#------------------------------------------------------------------------------
|
203 |
+
|
204 |
+
class MODNet(nn.Module):
    """ Architecture of MODNet

    Wires a backbone taken from ``SUPPORTED_BACKBONES`` into the three
    branches defined in this module (``LRBranch``, ``HRBranch``,
    ``FusionBranch``).

    Args:
        in_channels: forwarded to the backbone constructor.
        hr_channels: channel width handed to ``HRBranch`` and ``FusionBranch``.
        backbone_arch: key into ``SUPPORTED_BACKBONES``.
        backbone_pretrained: when true, ``backbone.load_pretrained_ckpt()``
            is called after the in-module weight initialization.
    """

    def __init__(self, in_channels=3, hr_channels=32, backbone_arch='mobilenetv2', backbone_pretrained=True):
        super(MODNet, self).__init__()

        self.in_channels = in_channels
        self.hr_channels = hr_channels
        self.backbone_arch = backbone_arch
        self.backbone_pretrained = backbone_pretrained

        self.backbone = SUPPORTED_BACKBONES[self.backbone_arch](self.in_channels)

        self.lr_branch = LRBranch(self.backbone)
        self.hr_branch = HRBranch(self.hr_channels, self.backbone.enc_channels)
        self.f_branch = FusionBranch(self.hr_channels, self.backbone.enc_channels)

        # Initialize every conv / norm layer, backbone included.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                self._init_conv(m)
            elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d)):
                self._init_norm(m)

        # Runs after the generic init above, so whatever the backbone's
        # load_pretrained_ckpt() sets is not overwritten by that init.
        if self.backbone_pretrained:
            self.backbone.load_pretrained_ckpt()

    def forward(self, img, inference):
        """ Run the three branches.

        Args:
            img: input image tensor.
            inference: forwarded to the low- and high-resolution branches;
                when truthy they skip their auxiliary heads.

        Returns:
            ``(pred_semantic, pred_detail, pred_matte)``; the first two are
            ``None`` when ``inference`` is truthy.
        """
        pred_semantic, lr8x, [enc2x, enc4x] = self.lr_branch(img, inference)
        pred_detail, hr2x = self.hr_branch(img, enc2x, enc4x, lr8x, inference)
        pred_matte = self.f_branch(img, lr8x, hr2x)

        return pred_semantic, pred_detail, pred_matte

    def freeze_norm(self):
        """ Put every BatchNorm2d / InstanceNorm2d submodule into eval mode.

        Other submodules keep their current training flag.
        """
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d)):
                m.eval()

    def _init_conv(self, conv):
        """ Kaiming-uniform init for the weight, zeros for the bias (if any).
        """
        nn.init.kaiming_uniform_(
            conv.weight, a=0, mode='fan_in', nonlinearity='relu')
        if conv.bias is not None:
            nn.init.constant_(conv.bias, 0)

    def _init_norm(self, norm):
        """ Reset an affine norm layer to weight 1 and bias 0.

        Layers without a weight (non-affine) are left untouched.
        """
        if norm.weight is not None:
            nn.init.constant_(norm.weight, 1)
            nn.init.constant_(norm.bias, 0)
|