# Scraped repository-page residue removed; provenance: initial commit 708dec4 (haotiz).
import argparse
import logging
import torch
import torch.nn as nn
import timeit
from maskrcnn_benchmark.layers import *
from maskrcnn_benchmark.modeling.backbone.resnet_big import StdConv2d
from maskrcnn_benchmark.modeling.backbone.fpn import *
from maskrcnn_benchmark.modeling.rpn.inference import *
from maskrcnn_benchmark.modeling.roi_heads.box_head.inference import PostProcessor
from maskrcnn_benchmark.modeling.rpn.anchor_generator import BufferList
def profile(model, input_size, custom_ops=None, device="cpu", verbose=False, extra_args=None, return_time=False):
    """Count total ops and parameters of ``model`` for one zero-filled input.

    Args:
        model: ``nn.Module`` to profile; its train/eval mode and device are
            restored before returning.
        input_size: shape of the dummy input tensor fed to the model.
        custom_ops: optional ``{module_type: hook}`` mapping that takes
            precedence over the module-level ``register_hooks`` table.
        device: device the profiling forward pass runs on.
        verbose: if True, print each module a FLOP counter is attached to.
        extra_args: extra keyword arguments forwarded to ``model(...)``.
        return_time: if True, also return the forward wall-clock time (s).

    Returns:
        ``(total_ops, total_params)`` or ``(total_ops, total_params, total_time)``.
    """
    # Fix: dict defaults were mutable ({}) and shared across calls.
    custom_ops = {} if custom_ops is None else custom_ops
    extra_args = {} if extra_args is None else extra_args
    handler_collection = []

    def add_hooks(m):
        # Only leaf modules are counted, so ops are not double counted.
        if len(list(m.children())) > 0:
            return
        # NOTE(review): these buffers remain on the model after profiling;
        # only the forward hooks are removed below.
        m.register_buffer('total_ops', torch.zeros(1))
        m.register_buffer('total_params', torch.zeros(1))
        for p in m.parameters():
            m.total_params += torch.Tensor([p.numel()])

        m_type = type(m)
        fn = None
        if m_type in custom_ops:
            fn = custom_ops[m_type]
        elif m_type in register_hooks:
            fn = register_hooks[m_type]
        else:
            print("Not implemented for ", m)

        if fn is not None:
            if verbose:
                print("Register FLOP counter for module %s" % str(m))
            handler = m.register_forward_hook(fn)
            handler_collection.append(handler)

    # Remember caller state so we can restore it afterwards.
    original_device = next(model.parameters()).device
    training = model.training

    model.eval().to(device)
    model.apply(add_hooks)

    x = torch.zeros(input_size).to(device)
    with torch.no_grad():
        tic = timeit.time.perf_counter()
        model(x, **extra_args)
        toc = timeit.time.perf_counter()
        total_time = toc - tic

    # Sum per-leaf counters accumulated by the hooks.
    total_ops = 0
    total_params = 0
    for m in model.modules():
        if len(list(m.children())) > 0:  # skip non-leaf modules
            continue
        total_ops += m.total_ops
        total_params += m.total_params
    total_ops = total_ops.item()
    total_params = total_params.item()

    # Restore the caller's training mode and device, drop the hooks.
    model.train(training).to(original_device)
    for handler in handler_collection:
        handler.remove()

    if return_time:
        return total_ops, total_params, total_time
    else:
        return total_ops, total_params
# 1 => count a fused multiply-add as a single op; set to 2 to count both.
multiply_adds = 1

def count_conv2d(m, x, y):
    """Store the op count of a Conv2d forward pass in ``m.total_ops``."""
    inp = x[0]
    batch = inp.size(0)
    out_h, out_w = y.size(2), y.size(3)
    # Each output element costs kh*kw*cin/groups MACs, plus one add if biased.
    kh, kw = m.kernel_size
    macs_per_out = multiply_adds * kh * kw * m.in_channels // m.groups
    per_out = macs_per_out + (1 if m.bias is not None else 0)
    n_out = batch * out_h * out_w * m.out_channels
    m.total_ops = torch.Tensor([int(n_out * per_out)])
def count_convtranspose2d(m, x, y):
    """Store the op count of a ConvTranspose2d forward pass in ``m.total_ops``.

    Fix: the original computed ``kernel_ops``/``bias_ops`` and a per-element
    cost, then unconditionally overwrote it with ``m.weight.nelement()`` —
    that dead code is removed; the resulting value is unchanged.

    NOTE(review): charging the full weight tensor per output element ignores
    groups and bias — kept as-is for parity with the original estimate.
    """
    ops_per_element = m.weight.nelement()
    output_elements = y.nelement()
    total_ops = output_elements * ops_per_element
    m.total_ops = torch.Tensor([int(total_ops)])
def count_bn(m, x, y):
    """Normalization cost: subtract, divide, gamma-scale, beta-shift per element."""
    n = x[0].numel()
    m.total_ops = torch.Tensor([int(4 * n)])
def count_relu(m, x, y):
    """One comparison per input element."""
    m.total_ops = torch.Tensor([int(x[0].numel())])
def count_softmax(m, x, y):
    """Softmax over a 2-D input: exp + sum + divide along the feature axis."""
    batch, feats = x[0].size()
    # feats exps, (feats - 1) adds for the normalizer, feats divides
    per_row = feats + (feats - 1) + feats
    m.total_ops = torch.Tensor([int(batch * per_row)])
def count_maxpool(m, x, y):
    """One comparison per kernel element for every pooled output value."""
    per_out = torch.prod(torch.Tensor([m.kernel_size]))
    m.total_ops = torch.Tensor([int(per_out * y.numel())])
def count_adap_maxpool(m, x, y):
    """Adaptive max-pool: infer the effective kernel from in/out spatial sizes."""
    in_dims = torch.Tensor([*(x[0].shape[2:])])
    out_dims = torch.Tensor([m.output_size]).squeeze()
    # floor-divide gives the per-axis kernel extent; product = comparisons per output
    per_out = torch.prod(in_dims // out_dims)
    m.total_ops = torch.Tensor([int(per_out * y.numel())])
def count_avgpool(m, x, y):
    """Average pool: one add per kernel element plus one divide, per output value."""
    adds = torch.prod(torch.Tensor([m.kernel_size]))
    per_out = adds + 1
    m.total_ops = torch.Tensor([int(per_out * y.numel())])
def count_adap_avgpool(m, x, y):
    """Adaptive average pool: kernel inferred from in/out sizes; adds + one divide."""
    in_dims = torch.Tensor([*(x[0].shape[2:])])
    out_dims = torch.Tensor([m.output_size]).squeeze()
    adds = torch.prod(in_dims // out_dims)
    per_out = adds + 1
    m.total_ops = torch.Tensor([int(per_out * y.numel())])
def count_linear(m, x, y):
    """Linear layer: in_features multiplies and in_features - 1 adds per output."""
    muls = m.in_features
    adds = m.in_features - 1
    m.total_ops = torch.Tensor([int((muls + adds) * y.numel())])
def count_LastLevelMaxPool(m, x, y):
    """Charge one op per element of the last feature map in the output list."""
    m.total_ops = torch.Tensor([int(y[-1].numel())])
def count_ROIAlign(m, x, y):
    """Four bilinear-interpolation taps per output element."""
    m.total_ops = torch.Tensor([int(4 * y.numel())])
# Dispatch table: module type -> forward hook that writes `m.total_ops`.
# A value of None marks a module as known-but-uncounted (no hook registered),
# which also suppresses the "Not implemented" warning in profile().
register_hooks = {
    Scale: None,
    # convolutions (all share the standard Conv2d MAC estimate)
    Conv2d: count_conv2d,
    nn.Conv2d: count_conv2d,
    ModulatedDeformConv: count_conv2d,  # NOTE(review): deformable offsets not counted
    StdConv2d: count_conv2d,
    # normalization layers (4 ops per element)
    nn.BatchNorm1d: count_bn,
    nn.BatchNorm2d: count_bn,
    nn.BatchNorm3d: count_bn,
    FrozenBatchNorm2d: count_bn,
    nn.GroupNorm: count_bn,
    NaiveSyncBatchNorm2d: count_bn,
    # activations
    nn.ReLU: count_relu,
    nn.ReLU6: count_relu,
    swish: None,
    nn.ConstantPad2d: None,
    # FPN extra-level pooling
    SPPLayer: count_LastLevelMaxPool,
    LastLevelMaxPool: count_LastLevelMaxPool,
    # pooling
    nn.MaxPool1d: count_maxpool,
    nn.MaxPool2d: count_maxpool,
    nn.MaxPool3d: count_maxpool,
    nn.AdaptiveMaxPool1d: count_adap_maxpool,
    nn.AdaptiveMaxPool2d: count_adap_maxpool,
    nn.AdaptiveMaxPool3d: count_adap_maxpool,
    nn.AvgPool1d: count_avgpool,
    nn.AvgPool2d: count_avgpool,
    nn.AvgPool3d: count_avgpool,
    nn.AdaptiveAvgPool1d: count_adap_avgpool,
    nn.AdaptiveAvgPool2d: count_adap_avgpool,
    nn.AdaptiveAvgPool3d: count_adap_avgpool,
    nn.Linear: count_linear,
    nn.Upsample: None,
    nn.Dropout: None,
    nn.Sigmoid: None,
    DropBlock2D: None,
    ROIAlign: count_ROIAlign,
    # detection post-processors: intentionally excluded from the op count
    RPNPostProcessor: None,
    PostProcessor: None,
    BufferList: None,
    RetinaPostProcessor: None,
    FCOSPostProcessor: None,
    ATSSPostProcessor: None,
}