Spaces:
Sleeping
Sleeping
File size: 5,875 Bytes
b4d6f1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
""" Convolution with Weight Standardization (StdConv and ScaledStdConv)
StdConv:
@article{weightstandardization,
author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille},
title = {Weight Standardization},
journal = {arXiv preprint arXiv:1903.10520},
year = {2019},
}
Code: https://github.com/joe-siyuan-qiao/WeightStandardization
ScaledStdConv:
Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
- https://arxiv.org/abs/2101.08692
Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets
Hacked together by / copyright Ross Wightman, 2021.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from .padding import get_padding, get_padding_value, pad_same
class StdConv2d(nn.Conv2d):
"""Conv2d with Weight Standardization. Used for BiT ResNet-V2 models.
Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` -
https://arxiv.org/abs/1903.10520v2
"""
def __init__(
self, in_channel, out_channels, kernel_size, stride=1, padding=None,
dilation=1, groups=1, bias=False, eps=1e-6):
if padding is None:
padding = get_padding(kernel_size, stride, dilation)
super().__init__(
in_channel, out_channels, kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=bias)
self.eps = eps
def forward(self, x):
weight = F.batch_norm(
self.weight.view(1, self.out_channels, -1), None, None,
training=True, momentum=0., eps=self.eps).reshape_as(self.weight)
x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
return x
class StdConv2dSame(nn.Conv2d):
"""Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model.
Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` -
https://arxiv.org/abs/1903.10520v2
"""
def __init__(
self, in_channel, out_channels, kernel_size, stride=1, padding='SAME',
dilation=1, groups=1, bias=False, eps=1e-6):
padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation)
super().__init__(
in_channel, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation,
groups=groups, bias=bias)
self.same_pad = is_dynamic
self.eps = eps
def forward(self, x):
if self.same_pad:
x = pad_same(x, self.kernel_size, self.stride, self.dilation)
weight = F.batch_norm(
self.weight.view(1, self.out_channels, -1), None, None,
training=True, momentum=0., eps=self.eps).reshape_as(self.weight)
x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
return x
class ScaledStdConv2d(nn.Conv2d):
"""Conv2d layer with Scaled Weight Standardization.
Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` -
https://arxiv.org/abs/2101.08692
NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor.
"""
def __init__(
self, in_channels, out_channels, kernel_size, stride=1, padding=None,
dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0):
if padding is None:
padding = get_padding(kernel_size, stride, dilation)
super().__init__(
in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation,
groups=groups, bias=bias)
self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init))
self.scale = gamma * self.weight[0].numel() ** -0.5 # gamma * 1 / sqrt(fan-in)
self.eps = eps
def forward(self, x):
weight = F.batch_norm(
self.weight.view(1, self.out_channels, -1), None, None,
weight=(self.gain * self.scale).view(-1),
training=True, momentum=0., eps=self.eps).reshape_as(self.weight)
return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class ScaledStdConv2dSame(nn.Conv2d):
"""Conv2d layer with Scaled Weight Standardization and Tensorflow-like SAME padding support
Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` -
https://arxiv.org/abs/2101.08692
NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor.
"""
def __init__(
self, in_channels, out_channels, kernel_size, stride=1, padding='SAME',
dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0):
padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation)
super().__init__(
in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation,
groups=groups, bias=bias)
self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init))
self.scale = gamma * self.weight[0].numel() ** -0.5
self.same_pad = is_dynamic
self.eps = eps
def forward(self, x):
if self.same_pad:
x = pad_same(x, self.kernel_size, self.stride, self.dilation)
weight = F.batch_norm(
self.weight.view(1, self.out_channels, -1), None, None,
weight=(self.gain * self.scale).view(-1),
training=True, momentum=0., eps=self.eps).reshape_as(self.weight)
return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
|