|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
This code is refer from: |
|
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/channel_reduction_encoder.py |
|
https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/robust_scanner_decoder.py |
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import math |
|
import paddle |
|
from paddle import ParamAttr |
|
import paddle.nn as nn |
|
import paddle.nn.functional as F |
|
|
|
class BaseDecoder(nn.Layer):
    """Abstract base class for RobustScanner decoders.

    Subclasses implement :meth:`forward_train` and :meth:`forward_test`;
    :meth:`forward` simply dispatches between them according to
    ``train_mode``.
    """

    def __init__(self, **kwargs):
        super().__init__()

    def forward_train(self, feat, out_enc, targets, img_metas):
        raise NotImplementedError

    def forward_test(self, feat, out_enc, img_metas):
        raise NotImplementedError

    def forward(self,
                feat,
                out_enc,
                label=None,
                valid_ratios=None,
                word_positions=None,
                train_mode=True):
        # Record the mode so subclasses may inspect it later if needed.
        self.train_mode = train_mode

        if not train_mode:
            return self.forward_test(feat, out_enc, valid_ratios,
                                     word_positions)
        return self.forward_train(feat, out_enc, label, valid_ratios,
                                  word_positions)
|
|
|
class ChannelReductionEncoder(nn.Layer):
    """Change the channel number with a one-by-one convolutional layer.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(ChannelReductionEncoder, self).__init__()
        # Pointwise convolution: spatial size is untouched, only the
        # channel dimension is remapped.
        self.layer = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=nn.initializer.XavierNormal())

    def forward(self, feat):
        """
        Args:
            feat (Tensor): Image features with the shape of
                :math:`(N, C_{in}, H, W)`.

        Returns:
            Tensor: A tensor of shape :math:`(N, C_{out}, H, W)`.
        """
        return self.layer(feat)
|
|
|
|
|
def masked_fill(x, mask, value):
    """Return a copy of ``x`` with positions where ``mask`` is True set to ``value``."""
    fill = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask, fill, x)
|
|
|
class DotProductAttentionLayer(nn.Layer):
    """Scaled dot-product attention over flattened 2D feature maps.

    Args:
        dim_model (int, optional): Model dimension used for the
            :math:`1/\\sqrt{d}` scaling; when ``None`` no scaling is applied.
    """

    def __init__(self, dim_model=None):
        super().__init__()
        self.scale = 1. if dim_model is None else dim_model**-0.5

    def forward(self, query, key, value, h, w, valid_ratios=None):
        # query: (N, C, T) -> (N, T, C); key: (N, C, H*W).
        q = paddle.transpose(query, (0, 2, 1))
        logits = paddle.matmul(q, key) * self.scale
        n, c, t = logits.shape

        # View attention logits spatially so padded columns can be masked.
        logits = paddle.reshape(logits, [n, c, h, w])
        if valid_ratios is not None:
            # Suppress attention to the horizontally padded region of each
            # sample before the softmax.
            for i, valid_ratio in enumerate(valid_ratios):
                valid_width = min(w, int(w * valid_ratio + 0.5))
                if valid_width < w:
                    logits[i, :, :, valid_width:] = float('-inf')
        logits = paddle.reshape(logits, [n, c, t])

        weights = F.softmax(logits, axis=2)
        v = paddle.transpose(value, (0, 2, 1))
        # (N, T_q, T_kv) x (N, T_kv, C) -> (N, T_q, C) -> (N, C, T_q).
        glimpse = paddle.transpose(paddle.matmul(weights, v), (0, 2, 1))
        return glimpse
|
|
|
class SequenceAttentionDecoder(BaseDecoder): |
|
"""Sequence attention decoder for RobustScanner. |
|
|
|
RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for |
|
Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_ |
|
|
|
Args: |
|
num_classes (int): Number of output classes :math:`C`. |
|
rnn_layers (int): Number of RNN layers. |
|
dim_input (int): Dimension :math:`D_i` of input vector ``feat``. |
|
dim_model (int): Dimension :math:`D_m` of the model. Should also be the |
|
same as encoder output vector ``out_enc``. |
|
max_seq_len (int): Maximum output sequence length :math:`T`. |
|
start_idx (int): The index of `<SOS>`. |
|
mask (bool): Whether to mask input features according to |
|
``img_meta['valid_ratio']``. |
|
padding_idx (int): The index of `<PAD>`. |
|
dropout (float): Dropout rate. |
|
return_feature (bool): Return feature or logits as the result. |
|
encode_value (bool): Whether to use the output of encoder ``out_enc`` |
|
as `value` of attention layer. If False, the original feature |
|
``feat`` will be used. |
|
|
|
Warning: |
|
This decoder will not predict the final class which is assumed to be |
|
`<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>` |
|
is also ignored by loss as specified in |
|
:obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`. |
|
""" |
|
|
|
def __init__(self, |
|
num_classes=None, |
|
rnn_layers=2, |
|
dim_input=512, |
|
dim_model=128, |
|
max_seq_len=40, |
|
start_idx=0, |
|
mask=True, |
|
padding_idx=None, |
|
dropout=0, |
|
return_feature=False, |
|
encode_value=False): |
|
super().__init__() |
|
|
|
self.num_classes = num_classes |
|
self.dim_input = dim_input |
|
self.dim_model = dim_model |
|
self.return_feature = return_feature |
|
self.encode_value = encode_value |
|
self.max_seq_len = max_seq_len |
|
self.start_idx = start_idx |
|
self.mask = mask |
|
|
|
self.embedding = nn.Embedding( |
|
self.num_classes, self.dim_model, padding_idx=padding_idx) |
|
|
|
self.sequence_layer = nn.LSTM( |
|
input_size=dim_model, |
|
hidden_size=dim_model, |
|
num_layers=rnn_layers, |
|
time_major=False, |
|
dropout=dropout) |
|
|
|
self.attention_layer = DotProductAttentionLayer() |
|
|
|
self.prediction = None |
|
if not self.return_feature: |
|
pred_num_classes = num_classes - 1 |
|
self.prediction = nn.Linear( |
|
dim_model if encode_value else dim_input, pred_num_classes) |
|
|
|
def forward_train(self, feat, out_enc, targets, valid_ratios): |
|
""" |
|
Args: |
|
feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. |
|
out_enc (Tensor): Encoder output of shape |
|
:math:`(N, D_m, H, W)`. |
|
targets (Tensor): a tensor of shape :math:`(N, T)`. Each element is the index of a |
|
character. |
|
valid_ratios (Tensor): valid length ratio of img. |
|
Returns: |
|
Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if |
|
``return_feature=False``. Otherwise it would be the hidden feature |
|
before the prediction projection layer, whose shape is |
|
:math:`(N, T, D_m)`. |
|
""" |
|
|
|
tgt_embedding = self.embedding(targets) |
|
|
|
n, c_enc, h, w = out_enc.shape |
|
assert c_enc == self.dim_model |
|
_, c_feat, _, _ = feat.shape |
|
assert c_feat == self.dim_input |
|
_, len_q, c_q = tgt_embedding.shape |
|
assert c_q == self.dim_model |
|
assert len_q <= self.max_seq_len |
|
|
|
query, _ = self.sequence_layer(tgt_embedding) |
|
query = paddle.transpose(query, (0, 2, 1)) |
|
key = paddle.reshape(out_enc, [n, c_enc, h * w]) |
|
if self.encode_value: |
|
value = key |
|
else: |
|
value = paddle.reshape(feat, [n, c_feat, h * w]) |
|
|
|
attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) |
|
attn_out = paddle.transpose(attn_out, (0, 2, 1)) |
|
|
|
if self.return_feature: |
|
return attn_out |
|
|
|
out = self.prediction(attn_out) |
|
|
|
return out |
|
|
|
def forward_test(self, feat, out_enc, valid_ratios): |
|
""" |
|
Args: |
|
feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. |
|
out_enc (Tensor): Encoder output of shape |
|
:math:`(N, D_m, H, W)`. |
|
valid_ratios (Tensor): valid length ratio of img. |
|
|
|
Returns: |
|
Tensor: The output logit sequence tensor of shape |
|
:math:`(N, T, C-1)`. |
|
""" |
|
seq_len = self.max_seq_len |
|
batch_size = feat.shape[0] |
|
|
|
decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx) |
|
|
|
outputs = [] |
|
for i in range(seq_len): |
|
step_out = self.forward_test_step(feat, out_enc, decode_sequence, |
|
i, valid_ratios) |
|
outputs.append(step_out) |
|
max_idx = paddle.argmax(step_out, axis=1, keepdim=False) |
|
if i < seq_len - 1: |
|
decode_sequence[:, i + 1] = max_idx |
|
|
|
outputs = paddle.stack(outputs, 1) |
|
|
|
return outputs |
|
|
|
def forward_test_step(self, feat, out_enc, decode_sequence, current_step, |
|
valid_ratios): |
|
""" |
|
Args: |
|
feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. |
|
out_enc (Tensor): Encoder output of shape |
|
:math:`(N, D_m, H, W)`. |
|
decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that |
|
stores history decoding result. |
|
current_step (int): Current decoding step. |
|
valid_ratios (Tensor): valid length ratio of img |
|
|
|
Returns: |
|
Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted |
|
tokens at current time step. |
|
""" |
|
|
|
embed = self.embedding(decode_sequence) |
|
|
|
n, c_enc, h, w = out_enc.shape |
|
assert c_enc == self.dim_model |
|
_, c_feat, _, _ = feat.shape |
|
assert c_feat == self.dim_input |
|
_, _, c_q = embed.shape |
|
assert c_q == self.dim_model |
|
|
|
query, _ = self.sequence_layer(embed) |
|
query = paddle.transpose(query, (0, 2, 1)) |
|
key = paddle.reshape(out_enc, [n, c_enc, h * w]) |
|
if self.encode_value: |
|
value = key |
|
else: |
|
value = paddle.reshape(feat, [n, c_feat, h * w]) |
|
|
|
|
|
attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) |
|
out = attn_out[:, :, current_step] |
|
|
|
if self.return_feature: |
|
return out |
|
|
|
out = self.prediction(out) |
|
out = F.softmax(out, dim=-1) |
|
|
|
return out |
|
|
|
|
|
class PositionAwareLayer(nn.Layer):
    """Encode positional cues into a feature map.

    Runs an LSTM along the width of every image row, then mixes the result
    with two 3x3 convolutions.
    """

    def __init__(self, dim_model, rnn_layers=2):
        super().__init__()

        self.dim_model = dim_model

        self.rnn = nn.LSTM(
            input_size=dim_model,
            hidden_size=dim_model,
            num_layers=rnn_layers,
            time_major=False)

        self.mixer = nn.Sequential(
            nn.Conv2D(
                dim_model, dim_model, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2D(
                dim_model, dim_model, kernel_size=3, stride=1, padding=1))

    def forward(self, img_feature):
        n, c, h, w = img_feature.shape
        # (N, C, H, W) -> (N*H, W, C): every image row becomes one sequence.
        seq = paddle.transpose(img_feature, (0, 2, 3, 1))
        seq = paddle.reshape(seq, (n * h, w, c))
        seq, _ = self.rnn(seq)
        # Restore the (N, C, H, W) layout before the convolutional mixer.
        restored = paddle.reshape(seq, (n, h, w, c))
        restored = paddle.transpose(restored, (0, 3, 1, 2))
        return self.mixer(restored)
|
|
|
|
|
class PositionAttentionDecoder(BaseDecoder):
    """Position attention decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        rnn_layers (int): Number of RNN layers.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        return_feature (bool): Return feature or logits as the result.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss.
    """

    def __init__(self,
                 num_classes=None,
                 rnn_layers=2,
                 dim_input=512,
                 dim_model=128,
                 max_seq_len=40,
                 mask=True,
                 return_feature=False,
                 encode_value=False):
        super().__init__()

        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.return_feature = return_feature
        self.encode_value = encode_value
        self.mask = mask

        # One learned embedding per decoding position (plus one spare slot).
        self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model)

        self.position_aware_module = PositionAwareLayer(self.dim_model,
                                                        rnn_layers)

        self.attention_layer = DotProductAttentionLayer()

        self.prediction = None
        if not self.return_feature:
            # <PAD> is never predicted, hence C - 1 output classes.
            self.prediction = nn.Linear(
                dim_model if encode_value else dim_input, num_classes - 1)

    def _get_position_index(self, length, batch_size):
        """Build a ``(batch_size, length)`` tensor of position ids ``0..length-1``."""
        row = paddle.arange(0, end=length, step=1, dtype='int64')
        return paddle.stack([row for _ in range(batch_size)], axis=0)

    def forward_train(self, feat, out_enc, targets, valid_ratios,
                      position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            targets (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it will be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input
        _, len_q = targets.shape
        assert len_q <= self.max_seq_len

        position_out_enc = self.position_aware_module(out_enc)

        # Queries come from position embeddings; keys from the
        # position-aware encoding of the feature map.
        query = paddle.transpose(self.embedding(position_index), (0, 2, 1))
        key = paddle.reshape(position_out_enc, (n, c_enc, h * w))
        if self.encode_value:
            value = paddle.reshape(out_enc, (n, c_enc, h * w))
        else:
            value = paddle.reshape(feat, (n, c_feat, h * w))

        attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
        attn_out = paddle.transpose(attn_out, (0, 2, 1))

        if self.return_feature:
            return attn_out
        return self.prediction(attn_out)

    def forward_test(self, feat, out_enc, valid_ratios, position_index):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            position_index (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if
            ``return_feature=False``. Otherwise it would be the hidden feature
            before the prediction projection layer, whose shape is
            :math:`(N, T, D_m)`.
        """
        n, c_enc, h, w = out_enc.shape
        assert c_enc == self.dim_model
        _, c_feat, _, _ = feat.shape
        assert c_feat == self.dim_input

        position_out_enc = self.position_aware_module(out_enc)

        query = paddle.transpose(self.embedding(position_index), (0, 2, 1))
        key = paddle.reshape(position_out_enc, (n, c_enc, h * w))
        if self.encode_value:
            value = paddle.reshape(out_enc, (n, c_enc, h * w))
        else:
            value = paddle.reshape(feat, (n, c_feat, h * w))

        attn_out = self.attention_layer(query, key, value, h, w, valid_ratios)
        attn_out = paddle.transpose(attn_out, (0, 2, 1))

        if self.return_feature:
            return attn_out
        return self.prediction(attn_out)
|
|
|
class RobustScannerFusionLayer(nn.Layer):
    """Fuse two equally-shaped feature streams with a gated linear unit."""

    def __init__(self, dim_model, dim=-1):
        super(RobustScannerFusionLayer, self).__init__()

        self.dim_model = dim_model
        self.dim = dim
        # Project the concatenation to twice the model width so GLU can
        # split it into value and gate halves.
        self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2)

    def forward(self, x0, x1):
        assert x0.shape == x1.shape
        combined = paddle.concat([x0, x1], self.dim)
        projected = self.linear_layer(combined)
        return F.glu(projected, self.dim)
|
|
|
class RobustScannerDecoder(BaseDecoder):
    """Decoder for RobustScanner.

    RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for
    Robust Text Recognition <https://arxiv.org/abs/2007.07542>`_

    Args:
        num_classes (int): Number of output classes :math:`C`.
        dim_input (int): Dimension :math:`D_i` of input vector ``feat``.
        dim_model (int): Dimension :math:`D_m` of the model. Should also be the
            same as encoder output vector ``out_enc``.
        max_seq_len (int): Maximum output sequence length :math:`T`.
        start_idx (int): The index of `<SOS>`.
        mask (bool): Whether to mask input features according to
            ``img_meta['valid_ratio']``.
        padding_idx (int): The index of `<PAD>`.
        encode_value (bool): Whether to use the output of encoder ``out_enc``
            as `value` of attention layer. If False, the original feature
            ``feat`` will be used.

    Warning:
        This decoder will not predict the final class which is assumed to be
        `<PAD>`. Therefore, its output size is always :math:`C - 1`. `<PAD>`
        is also ignored by loss as specified in
        :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`.
    """

    def __init__(self,
                 num_classes=None,
                 dim_input=512,
                 dim_model=128,
                 hybrid_decoder_rnn_layers=2,
                 hybrid_decoder_dropout=0,
                 position_decoder_rnn_layers=2,
                 max_seq_len=40,
                 start_idx=0,
                 mask=True,
                 padding_idx=None,
                 encode_value=False):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_model = dim_model
        self.max_seq_len = max_seq_len
        self.encode_value = encode_value
        self.start_idx = start_idx
        self.padding_idx = padding_idx
        self.mask = mask

        # Hybrid branch: queries derived from previously decoded characters.
        # ``return_feature=True`` so the branch yields glimpses, not logits.
        self.hybrid_decoder = SequenceAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=hybrid_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_seq_len,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            dropout=hybrid_decoder_dropout,
            encode_value=encode_value,
            return_feature=True
        )

        # Position branch: queries derived purely from the character
        # position, independent of previously decoded characters.
        self.position_decoder = PositionAttentionDecoder(
            num_classes=num_classes,
            rnn_layers=position_decoder_rnn_layers,
            dim_input=dim_input,
            dim_model=dim_model,
            max_seq_len=max_seq_len,
            mask=mask,
            encode_value=encode_value,
            return_feature=True
        )

        # Gated (GLU) fusion of the two branches' glimpses.
        self.fusion_module = RobustScannerFusionLayer(
            self.dim_model if encode_value else dim_input)

        # <PAD> is never predicted, hence C - 1 output classes.
        pred_num_classes = num_classes - 1
        self.prediction = nn.Linear(dim_model if encode_value else dim_input,
                                    pred_num_classes)

    def forward_train(self, feat, out_enc, target, valid_ratios, word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            target (Tensor): A tensor of shape :math:`(N, T)`. Each element
                is the index of a character.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`.
        """
        # During training both branches run in parallel over the whole
        # (teacher-forced) target sequence.
        hybrid_glimpse = self.hybrid_decoder.forward_train(
            feat, out_enc, target, valid_ratios)
        position_glimpse = self.position_decoder.forward_train(
            feat, out_enc, target, valid_ratios, word_positions)

        fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse)

        out = self.prediction(fusion_out)

        return out

    def forward_test(self, feat, out_enc, valid_ratios, word_positions):
        """
        Args:
            feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`.
            out_enc (Tensor): Encoder output of shape
                :math:`(N, D_m, H, W)`.
            valid_ratios (Tensor): Valid length ratio of img.
            word_positions (Tensor): The position of each word.

        Returns:
            Tensor: The output probability sequence tensor of shape
            :math:`(N, T, C-1)`.
        """
        seq_len = self.max_seq_len
        batch_size = feat.shape[0]

        # Every sequence starts as all-<SOS>; steps fill in later positions.
        decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx)

        # The position branch does not depend on decoded characters, so its
        # glimpses for all steps are computed once up front.
        position_glimpse = self.position_decoder.forward_test(
            feat, out_enc, valid_ratios, word_positions)

        outputs = []
        for i in range(seq_len):
            # One autoregressive step of the hybrid branch, conditioned on
            # the decode history accumulated so far.
            hybrid_glimpse_step = self.hybrid_decoder.forward_test_step(
                feat, out_enc, decode_sequence, i, valid_ratios)

            fusion_out = self.fusion_module(hybrid_glimpse_step,
                                            position_glimpse[:, i, :])

            char_out = self.prediction(fusion_out)
            char_out = F.softmax(char_out, -1)
            outputs.append(char_out)
            # Greedy decoding: write the argmax back into the history so the
            # next step sees it. Order matters: step i must finish before
            # position i + 1 is filled.
            max_idx = paddle.argmax(char_out, axis=1, keepdim=False)
            if i < seq_len - 1:
                decode_sequence[:, i + 1] = max_idx

        outputs = paddle.stack(outputs, 1)

        return outputs
|
|
|
class RobustScannerHead(nn.Layer):
    """RobustScanner recognition head: channel-reduction encoder + decoder.

    Args:
        out_channels (int): Number of output classes.
        in_channels (int): Channel count of the backbone feature map.
        enc_outchannles (int): Channels produced by the reduction encoder,
            i.e. the decoder's model dimension.
    """

    def __init__(self,
                 out_channels,
                 in_channels,
                 enc_outchannles=128,
                 hybrid_dec_rnn_layers=2,
                 hybrid_dec_dropout=0,
                 position_dec_rnn_layers=2,
                 start_idx=0,
                 max_text_length=40,
                 mask=True,
                 padding_idx=None,
                 encode_value=False,
                 **kwargs):
        super(RobustScannerHead, self).__init__()

        # Bring backbone features down to the decoder's model width.
        self.encoder = ChannelReductionEncoder(
            in_channels=in_channels, out_channels=enc_outchannles)

        self.decoder = RobustScannerDecoder(
            num_classes=out_channels,
            dim_input=in_channels,
            dim_model=enc_outchannles,
            hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers,
            hybrid_decoder_dropout=hybrid_dec_dropout,
            position_decoder_rnn_layers=position_dec_rnn_layers,
            max_seq_len=max_text_length,
            start_idx=start_idx,
            mask=mask,
            padding_idx=padding_idx,
            encode_value=encode_value)

    def forward(self, inputs, targets=None):
        '''
        targets: [label, valid_ratio, word_positions]
        '''
        out_enc = self.encoder(inputs)

        word_positions = targets[-1]
        # valid_ratio is optional; when present it precedes word_positions.
        valid_ratios = targets[-2] if len(targets) > 1 else None

        if self.training:
            label = paddle.to_tensor(targets[0], dtype='int64')
            return self.decoder(inputs, out_enc, label, valid_ratios,
                                word_positions)

        return self.decoder(
            inputs,
            out_enc,
            label=None,
            valid_ratios=valid_ratios,
            word_positions=word_positions,
            train_mode=False)
|
|