from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import math

import numpy as np
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F

from .tps import GridGenerator

'''This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/davarocr/davar_rcg/models/transformations/gaspin_transformation.py
'''


class SP_TransformerNetwork(nn.Layer):
    """
    Structure-Preserving Transformation (SPT) as Eq. (2) in Ref. [1]

    Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021.
    """

    def __init__(self, nc=1, default_type=5):
        """ Based on SPIN
        Args:
            nc (int): number of input channels (usually 1 for grayscale or 3 for RGB)
            default_type (int): the complexity K of transformation intensities
                (the paper uses K=6, which GA_SPIN_Transformer passes by default)
        """
        super(SP_TransformerNetwork, self).__init__()
        self.power_list = self.cal_K(default_type)
        self.sigmoid = nn.Sigmoid()
        # normalizes the fused intensity map channel-wise before the sigmoid
        self.bn = nn.InstanceNorm2D(nc)

    def cal_K(self, k=5):
        """
        Args:
            k (int): the complexity K of transformation intensities
                (the paper uses K=6)

        Returns:
            List: the 2K+1 exponents \beta of the power terms in Eq. (2),
                ordered as [p_1, 1/p_1, ..., p_K, 1/p_K, 1.00]
        """
        x = []
        if k != 0:
            for i in range(1, k + 1):
                # paired exponents p < 1 and 1/p > 1, i.e. mutually inverse
                # gamma-like curves around the identity exponent 1.0
                lower = round(
                    math.log(1 - (0.5 / (k + 1)) * i) /
                    math.log((0.5 / (k + 1)) * i), 2)
                upper = round(1 / lower, 2)
                x.append(lower)
                x.append(upper)
        x.append(1.00)
        return x
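
    # Worked example (computed by hand from the formula above, shown only to
    # make the output concrete; last digits subject to rounding):
    #   cal_K(5) -> [0.04, 25.0, 0.1, 10.0, 0.21, 4.76, 0.37, 2.7, 0.62, 1.61, 1.0]
    # i.e. five (p, 1/p) pairs plus the identity exponent 1.0.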

    def forward(self, batch_I, weights, offsets, lambda_color=None):
        """
        Args:
            batch_I (Tensor): batch of input images [batch_size x nc x I_height x I_width]
            weights (Tensor): combination weights over the power terms,
                predicted by the SPN [batch_size x (2K+1) x 1]
            offsets (Tensor): the offset map x_{offsets} predicted by AIN,
                already resized to the input resolution (None if AIN is disabled)
            lambda_color (Tensor): the learnable update gate \alpha in Eq. (5) as
                g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets}

        Returns:
            Tensor: transformed images by SPN as Eq. (4) in Ref. [1]
                [batch_size x I_channel_num x I_r_height x I_r_width]
        """
        # map inputs from [-1, 1] back to [0, 1] intensities
        batch_I = (batch_I + 1) * 0.5
        if offsets is not None:
            # AIN update gate, Eq. (5): blend the image with the offset map
            batch_I = batch_I * (1 - lambda_color) + offsets * lambda_color
        batch_weight_params = paddle.unsqueeze(paddle.unsqueeze(weights, -1), -1)
        batch_I_power = paddle.stack(
            [batch_I.pow(p) for p in self.power_list], axis=1)

        # weighted sum over the 2K+1 power terms (Eq. (2)), normalized and
        # squashed back to [-1, 1]
        batch_weight_sum = paddle.sum(batch_I_power * batch_weight_params, axis=1)
        batch_weight_sum = self.bn(batch_weight_sum)
        batch_weight_sum = self.sigmoid(batch_weight_sum)
        batch_weight_sum = batch_weight_sum * 2 - 1
        return batch_weight_sum
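
# Minimal shape sketch (illustrative only; random tensors, not a trained SPN):
#
#   spt = SP_TransformerNetwork(nc=1, default_type=6)
#   img = paddle.rand([2, 1, 32, 100]) * 2 - 1        # inputs in [-1, 1]
#   betas = paddle.rand([2, len(spt.power_list), 1])  # [B, 2K+1, 1] weights
#   out = spt(img, betas, offsets=None)               # -> [2, 1, 32, 100]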


class GA_SPIN_Transformer(nn.Layer):
    """
    Geometric-Absorbed SPIN Transformation (GA-SPIN) proposed in Ref. [1]

    Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021.
    """

    def __init__(self,
                 in_channels=1,
                 I_r_size=(32, 100),
                 offsets=False,
                 norm_type='BN',
                 default_type=6,
                 loc_lr=1,
                 stn=True):
        """
        Args:
            in_channels (int): channel of input features,
                set it to 1 for grayscale images and 3 for RGB input
            I_r_size (tuple): size of rectified images (used in STN transformations)
            offsets (bool): set it to False to use SPN without AIN,
                and to True to use the full SPIN (both SPN and AIN)
            norm_type (str): the normalization type of the module,
                'BN' by default, 'IN' optionally
            default_type (int): the K of the chromatic space,
                set it to 3/5/6 depending on the complexity of transformation intensities
            loc_lr (float): learning rate of the localization network
            stn (bool): whether to add an STN on top of SPIN
        """
        super(GA_SPIN_Transformer, self).__init__()
        self.nc = in_channels
        self.spt = True
        self.offsets = offsets
        self.stn = stn  # the geometric transform is absorbed only when True
        self.I_r_size = I_r_size
        self.out_channels = in_channels
        if norm_type == 'BN':
            norm_layer = functools.partial(
                nn.BatchNorm2D, use_global_stats=True)
        elif norm_type == 'IN':
            norm_layer = functools.partial(
                nn.InstanceNorm2D, weight_attr=False, use_global_stats=False)
        else:
            raise NotImplementedError(
                'normalization layer [%s] is not found' % norm_type)

        if self.spt:
            self.sp_net = SP_TransformerNetwork(in_channels, default_type)
            # shared feature extractor; each stage halves the spatial size,
            # so the default 32x100 input becomes a 4x12 feature map
            self.spt_convnet = nn.Sequential(
                # 32x100 -> 16x50
                nn.Conv2D(in_channels, 32, 3, 1, 1, bias_attr=False),
                norm_layer(32), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                # 16x50 -> 8x25
                nn.Conv2D(32, 64, 3, 1, 1, bias_attr=False),
                norm_layer(64), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),
                # 8x25 -> 4x12
                nn.Conv2D(64, 128, 3, 1, 1, bias_attr=False),
                norm_layer(128), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2), )
            # 'stucture' spelling kept as in the original code: renaming the
            # attribute would change state_dict keys for released weights
            self.stucture_fc1 = nn.Sequential(
                nn.Conv2D(128, 256, 3, 1, 1, bias_attr=False),
                norm_layer(256), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),  # 4x12 -> 2x6
                nn.Conv2D(256, 256, 3, 1, 1, bias_attr=False),
                norm_layer(256), nn.ReLU(),
                nn.MaxPool2D(kernel_size=2, stride=2),  # 2x6 -> 1x3
                nn.Conv2D(256, 512, 3, 1, 1, bias_attr=False),
                norm_layer(512), nn.ReLU(),
                nn.AdaptiveAvgPool2D(1),
                nn.Flatten(1, -1),  # batch_size x 512
                nn.Linear(512, 256, weight_attr=nn.initializer.Normal(0.001)),
                nn.BatchNorm1D(256), nn.ReLU())
            # output head width: 2K+1 SPN weights, plus one AIN gate when
            # offsets is set, plus 2F control-point coordinates when stn is set
            self.out_weight = 2 * default_type + 1
            self.spt_length = 2 * default_type + 1
            if offsets:
                self.out_weight += 1
            if self.stn:
                self.F = 20
                self.out_weight += self.F * 2
                self.GridGenerator = GridGenerator(self.F * 2, self.F)
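
            # e.g. with the defaults (default_type=6, offsets=False, stn=True):
            # out_weight = 13 + 2*20 = 53; with offsets=True it becomes 54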

            # the final FC predicts [SPN weights | AIN gate | STN control
            # points]; its weight starts at zero, so the initial transform is
            # fully determined by the bias from init_spin
            initial_bias = self.init_spin(default_type * 2)
            initial_bias = initial_bias.reshape(-1)
            param_attr = ParamAttr(
                learning_rate=loc_lr,
                initializer=nn.initializer.Assign(
                    np.zeros([256, self.out_weight])))
            bias_attr = ParamAttr(
                learning_rate=loc_lr,
                initializer=nn.initializer.Assign(initial_bias))
            self.stucture_fc2 = nn.Linear(256, self.out_weight,
                                          weight_attr=param_attr,
                                          bias_attr=bias_attr)
            self.sigmoid = nn.Sigmoid()

            if offsets:
                # AIN: a light conv head predicting the offset map x_{offsets}
                self.offset_fc1 = nn.Sequential(
                    nn.Conv2D(128, 16, 3, 1, 1, bias_attr=False),
                    norm_layer(16),
                    nn.ReLU(), )
                self.offset_fc2 = nn.Conv2D(16, in_channels, 3, 1, 1)
                self.pool = nn.MaxPool2D(2, 2)

    def init_spin(self, nz):
        """
        Args:
            nz (int): number of paired \beta exponents, i.e. 2K

        Returns:
            np.ndarray: the initial bias of stucture_fc2
        """
        # identity initialization: zero weight on the 2K paired power terms,
        # weight 5.0 on the identity exponent 1.0, and an AIN gate bias of
        # -5.0 (so sigmoid(\alpha) starts near 0) when offsets are used
        init_id = [0.00] * nz + [5.00]
        if self.offsets:
            init_id += [-5.00]

        init = np.array(init_id)

        if self.stn:
            # initial TPS fiducial points spread along the top and bottom of
            # the normalized image, as commonly used in STN localization heads
            F = self.F
            ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2))
            ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2))
            ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2))
            ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1)
            ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1)
            initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0)
            initial_bias = initial_bias.reshape(-1)
            init = np.concatenate([init, initial_bias], axis=0)
        return init
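
    # Layout check (arithmetic only): with default_type=6, offsets=True and
    # stn=True, init_spin(12) returns 12 zeros + [5.0, -5.0] + 40 control-point
    # coordinates = 54 values, matching out_weight above.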

    def forward(self, x, return_weight=False):
        """
        Args:
            x (Tensor): input image batch [batch_size x in_channels x I_height x I_width]
            return_weight (bool): False by default;
                if True, return the predicted offsets of AIN, denoted as x_{offsets}

        Returns:
            Tensor: rectified image [batch_size x I_channel_num x I_r_height x I_r_width],
                which equals the input size for the default 32x100 setting
        """

        if self.spt:
            feat = self.spt_convnet(x)
            fc1 = self.stucture_fc1(feat)
            sp_weight_fusion = self.stucture_fc2(fc1)
            sp_weight_fusion = sp_weight_fusion.reshape(
                [x.shape[0], self.out_weight, 1])
            if self.offsets:  # SPIN with AIN
                # split the fused prediction: SPN weights first, then the gate
                lambda_color = sp_weight_fusion[:, self.spt_length, 0]
                lambda_color = self.sigmoid(lambda_color).unsqueeze(
                    -1).unsqueeze(-1).unsqueeze(-1)
                sp_weight = sp_weight_fusion[:, :self.spt_length, :]
                offsets = self.pool(self.offset_fc2(self.offset_fc1(feat)))

                # the offset head assumes the default 32x100 input, which the
                # backbone has reduced to a 2x6 map at this point
                assert offsets.shape[2] == 2
                assert offsets.shape[3] == 6
                offsets = self.sigmoid(offsets)

                if return_weight:
                    return offsets
                offsets = F.upsample(
                    offsets, size=(x.shape[2], x.shape[3]), mode='bilinear')

                if self.stn:  # SPIN with STN
                    # control points follow the AIN gate slot, hence the +1
                    batch_C_prime = sp_weight_fusion[:, (self.spt_length + 1):, :].reshape(
                        [x.shape[0], self.F, 2])
                    build_P_prime = self.GridGenerator(batch_C_prime, self.I_r_size)
                    build_P_prime_reshape = build_P_prime.reshape(
                        [build_P_prime.shape[0], self.I_r_size[0], self.I_r_size[1], 2])

            else:  # SPN without AIN
                sp_weight = sp_weight_fusion[:, :self.spt_length, :]
                lambda_color, offsets = None, None

                if self.stn:  # SPIN with STN
                    batch_C_prime = sp_weight_fusion[:, self.spt_length:, :].reshape(
                        [x.shape[0], self.F, 2])
                    build_P_prime = self.GridGenerator(batch_C_prime, self.I_r_size)
                    build_P_prime_reshape = build_P_prime.reshape(
                        [build_P_prime.shape[0], self.I_r_size[0], self.I_r_size[1], 2])

            x = self.sp_net(x, sp_weight, offsets, lambda_color)
            if self.stn:
                # warp the chromatically corrected image with the TPS grid
                x = F.grid_sample(x=x, grid=build_P_prime_reshape, padding_mode='border')
        return x
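
# Minimal usage sketch (illustrative only; in PaddleOCR this module is built
# from a config and trained end-to-end, so the random input below is just a
# shape check):
#
#   ga_spin = GA_SPIN_Transformer(in_channels=1, I_r_size=(32, 100),
#                                 offsets=True, default_type=6, stn=True)
#   imgs = paddle.rand([2, 1, 32, 100]) * 2 - 1  # inputs normalized to [-1, 1]
#   rectified = ga_spin(imgs)                    # -> [2, 1, 32, 100]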