from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import paddle
from paddle import nn, ParamAttr
from paddle.nn import functional as F
import numpy as np
from collections import OrderedDict

from .self_attention import WrapEncoderForFeature
from .self_attention import WrapEncoder
from paddle.static import Program
from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN

gradient_clip = 10
|
|
class PVAM(nn.Layer): |
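    """Parallel Visual Attention Module (PVAM).

    Encodes the flattened backbone feature map with a transformer encoder,
    then lets learned reading-order embeddings attend over the visual
    sequence, producing one aligned feature vector per output time step
    (SRN: Yu et al., "Towards Accurate Scene Text Recognition with Semantic
    Reasoning Networks").
    """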
|
    def __init__(self, in_channels, char_num, max_text_length, num_heads,
                 num_encoder_tus, hidden_dims):
        super(PVAM, self).__init__()
        self.char_num = char_num
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_tus
        self.hidden_dims = hidden_dims

        t = 256
        c = 512
        self.wrap_encoder_for_feature = WrapEncoderForFeature(
            src_vocab_size=1,
            max_length=t,
            n_layer=self.num_encoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        self.flatten0 = paddle.nn.Flatten(start_axis=0, stop_axis=1)
        self.fc0 = paddle.nn.Linear(
            in_features=in_channels, out_features=in_channels)
        self.emb = paddle.nn.Embedding(
            num_embeddings=self.max_length, embedding_dim=in_channels)
        self.flatten1 = paddle.nn.Flatten(start_axis=0, stop_axis=2)
        self.fc1 = paddle.nn.Linear(
            in_features=in_channels, out_features=1, bias_attr=False)
|
def forward(self, inputs, encoder_word_pos, gsrm_word_pos): |
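        """Align visual features into per-character order via parallel attention.

        inputs: backbone feature map of shape [b, c, h, w].
        Returns pvam_features of shape [b, max_text_length, c].
        """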
|
        b, c, h, w = inputs.shape
        conv_features = paddle.reshape(inputs, shape=[-1, c, h * w])
        conv_features = paddle.transpose(conv_features, perm=[0, 2, 1])

        b, t, c = conv_features.shape

        enc_inputs = [conv_features, encoder_word_pos, None]
        word_features = self.wrap_encoder_for_feature(enc_inputs)

        b, t, c = word_features.shape
        word_features = self.fc0(word_features)
        word_features_ = paddle.reshape(word_features, [-1, 1, t, c])
        word_features_ = paddle.tile(word_features_, [1, self.max_length, 1, 1])
        word_pos_feature = self.emb(gsrm_word_pos)
        word_pos_feature_ = paddle.reshape(word_pos_feature,
                                           [-1, self.max_length, 1, c])
        word_pos_feature_ = paddle.tile(word_pos_feature_, [1, 1, t, 1])
        y = word_pos_feature_ + word_features_
        y = F.tanh(y)
        attention_weight = self.fc1(y)
        attention_weight = paddle.reshape(
            attention_weight, shape=[-1, self.max_length, t])
        attention_weight = F.softmax(attention_weight, axis=-1)
        pvam_features = paddle.matmul(attention_weight, word_features)
        return pvam_features

|
|
class GSRM(nn.Layer): |
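    """Global Semantic Reasoning Module (GSRM).

    Decodes an initial character distribution from the PVAM features, then
    reasons over the predicted symbols with two transformer streams (one
    reading the sequence forward, one backward) to produce semantically
    refined features.
    """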
|
    def __init__(self, in_channels, char_num, max_text_length, num_heads,
                 num_encoder_tus, num_decoder_tus, hidden_dims):
        super(GSRM, self).__init__()
        self.char_num = char_num
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_tus
        self.num_decoder_TUs = num_decoder_tus
        self.hidden_dims = hidden_dims

        self.fc0 = paddle.nn.Linear(
            in_features=in_channels, out_features=self.char_num)
        self.wrap_encoder0 = WrapEncoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        self.wrap_encoder1 = WrapEncoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True)

        # Project GSRM features back onto the shared character embedding,
        # i.e. the output weights are tied to the input embedding table.
        self.mul = lambda x: paddle.matmul(
            x=x,
            y=self.wrap_encoder0.prepare_decoder.emb0.weight,
            transpose_y=True)

|
    def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1,
                gsrm_slf_attn_bias2):
        b, t, c = inputs.shape
        pvam_features = paddle.reshape(inputs, [-1, c])
        word_out = self.fc0(pvam_features)
        word_ids = paddle.argmax(F.softmax(word_out), axis=1)
        word_ids = paddle.reshape(x=word_ids, shape=[-1, t, 1])
|
""" |
|
This module is achieved through bi-transformers, |
|
ngram_feature1 is the froward one, ngram_fetaure2 is the backward one |
|
""" |
|
        pad_idx = self.char_num

        # Forward stream input: prepend the pad token and drop the last
        # symbol (right shift), so each position only sees its predecessors.
        word1 = paddle.cast(word_ids, "float32")
        word1 = F.pad(word1, [1, 0], value=1.0 * pad_idx, data_format="NLC")
        word1 = paddle.cast(word1, "int64")
        word1 = word1[:, :-1, :]
        word2 = word_ids

        enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]

        gsrm_feature1 = self.wrap_encoder0(enc_inputs_1)
        gsrm_feature2 = self.wrap_encoder1(enc_inputs_2)

        # Shift the backward stream left to re-align it with the forward one.
        gsrm_feature2 = F.pad(gsrm_feature2, [0, 1],
                              value=0.,
                              data_format="NLC")
        gsrm_feature2 = gsrm_feature2[:, 1:, :]
        gsrm_features = gsrm_feature1 + gsrm_feature2

        gsrm_out = self.mul(gsrm_features)

        b, t, c = gsrm_out.shape
        gsrm_out = paddle.reshape(gsrm_out, [-1, c])

        return gsrm_features, word_out, gsrm_out

|
|
class VSFD(nn.Layer): |
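    """Visual-Semantic Fusion Decoder (VSFD).

    Fuses the PVAM (visual) and GSRM (semantic) features with a learned
    element-wise gate and predicts the final character distribution.
    """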
|
    def __init__(self, in_channels=512, pvam_ch=512, char_num=38):
        super(VSFD, self).__init__()
        self.char_num = char_num
        self.fc0 = paddle.nn.Linear(
            in_features=in_channels * 2, out_features=pvam_ch)
        self.fc1 = paddle.nn.Linear(
            in_features=pvam_ch, out_features=self.char_num)

    def forward(self, pvam_feature, gsrm_feature):
        b, t, c1 = pvam_feature.shape
        b, t, c2 = gsrm_feature.shape
        combine_feature_ = paddle.concat([pvam_feature, gsrm_feature], axis=2)
        img_comb_feature_ = paddle.reshape(
            combine_feature_, shape=[-1, c1 + c2])
        img_comb_feature_map = self.fc0(img_comb_feature_)
        img_comb_feature_map = F.sigmoid(img_comb_feature_map)
        img_comb_feature_map = paddle.reshape(
            img_comb_feature_map, shape=[-1, t, c1])
        combine_feature = img_comb_feature_map * pvam_feature + (
            1.0 - img_comb_feature_map) * gsrm_feature
        img_comb_feature = paddle.reshape(combine_feature, shape=[-1, c1])

        out = self.fc1(img_comb_feature)
        return out

|
|
class SRNHead(nn.Layer): |
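    """SRN recognition head.

    Pipeline: PVAM aligns visual features per time step, GSRM refines the
    aligned features with global semantic reasoning, and VSFD fuses both
    streams into the final character predictions. `out_channels` is the
    number of character classes.

    Example (illustrative values only; the real ones come from the model
    config)::

        head = SRNHead(in_channels=512, out_channels=38, max_text_length=25,
                       num_heads=8, num_encoder_TUs=2, num_decoder_TUs=4,
                       hidden_dims=512)
    """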
|
    def __init__(self, in_channels, out_channels, max_text_length, num_heads,
                 num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs):
        super(SRNHead, self).__init__()
        self.char_num = out_channels
        self.max_length = max_text_length
        self.num_heads = num_heads
        self.num_encoder_TUs = num_encoder_TUs
        self.num_decoder_TUs = num_decoder_TUs
        self.hidden_dims = hidden_dims

        self.pvam = PVAM(
            in_channels=in_channels,
            char_num=self.char_num,
            max_text_length=self.max_length,
            num_heads=self.num_heads,
            num_encoder_tus=self.num_encoder_TUs,
            hidden_dims=self.hidden_dims)

        self.gsrm = GSRM(
            in_channels=in_channels,
            char_num=self.char_num,
            max_text_length=self.max_length,
            num_heads=self.num_heads,
            num_encoder_tus=self.num_encoder_TUs,
            num_decoder_tus=self.num_decoder_TUs,
            hidden_dims=self.hidden_dims)
        self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num)

        # Share the character embedding table between the two GSRM encoders.
        self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0

|
def forward(self, inputs, targets=None): |
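        """Run PVAM -> GSRM -> VSFD and collect all intermediate outputs."""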
|
        # The last four targets are the SRN auxiliary inputs: visual-sequence
        # position ids, reading-order position ids, and the self-attention
        # bias masks for the two GSRM streams.
        others = targets[-4:]
        encoder_word_pos = others[0]
        gsrm_word_pos = others[1]
        gsrm_slf_attn_bias1 = others[2]
        gsrm_slf_attn_bias2 = others[3]

        pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos)

        gsrm_feature, word_out, gsrm_out = self.gsrm(
            pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1,
            gsrm_slf_attn_bias2)

        final_out = self.vsfd(pvam_feature, gsrm_feature)
        if not self.training:
            final_out = F.softmax(final_out, axis=1)

        _, decoded_out = paddle.topk(final_out, k=1)

        predicts = OrderedDict([
            ('predict', final_out),
            ('pvam_feature', pvam_feature),
            ('decoded_out', decoded_out),
            ('word_out', word_out),
            ('gsrm_out', gsrm_out),
        ])

        return predicts
|
|