diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9c0ded3c2b10a1ab93954eb4a0ffb897075f4ca1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +simfang.ttf filter=lfs diff=lfs merge=lfs -text +ppocr/utils/dict/confuse.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9fb1fbbd7d16dc2e1e8781f5caa6a68bca2a7668 --- /dev/null +++ b/app.py @@ -0,0 +1,56 @@ +import os, io +from paddleocr import PaddleOCR, draw_ocr +from PIL import Image, ImageDraw +import gradio as gr + + +# 設定 Hugging Face Hub 的 Access Token +os.environ["HF_TOKEN"] = "TWOCR" + +def inference(img_path): + + ocr = PaddleOCR( + rec_char_dict_path='zhtw_common_dict.txt', + use_gpu=False, + rec_image_shape="3, 48, 320" + ) + + result = ocr.ocr(img_path) + + for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + + result = result[0] + image = Image.open(img_path).convert('RGB') + boxes = [line[0] for line in result] + txts = [line[1][0] if line[1] else '' for line in result] # 確保在無文字時 txts 還是個空字串 + scores = [line[1][1] for line in result] + im_show_pil = draw_ocr(image, boxes, txts, scores, font_path="./simfang.ttf") + + return im_show_pil, "\n".join(txts) + +title = "

繁體中文醫療診斷書和收據OCR:PaddleOCR
" + +description = """ +用PaddleOCR的PPOCRLabel來微調醫療診斷書和收據 +https://github.com/Deep-Learning-101 +https://github.com/Deep-Learning-101/Computer-Vision-Paper
+""" + + +css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}" + +gr.Interface( + inference, + [gr.inputs.Image(type='filepath', label='圖片上傳')], + outputs=[ + gr.outputs.Image(type="pil", label="識別結果"), + "text" + ], + title=title, + description=description, + css=css, + enable_queue=True + ).launch(debug=True) \ No newline at end of file diff --git a/ppocr/__init__.py b/ppocr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2 --- /dev/null +++ b/ppocr/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppocr/data/__init__.py b/ppocr/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b602a346dbe4b0d45af287f25f05ead0f62daf44 --- /dev/null +++ b/ppocr/data/__init__.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
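# --- Editor's note: illustrative sketch only, not part of the upstream file. ---
# build_dataloader() below consumes a nested config of roughly this shape
# (the concrete values here are hypothetical):
#
#   config['Train'] = {
#       'dataset': {'name': 'SimpleDataSet', ...},
#       'loader': {'batch_size_per_card': 8, 'drop_last': True,
#                  'shuffle': True, 'num_workers': 4,
#                  'use_shared_memory': True},
#   }
#
# In 'Train' mode the dataset is wrapped in a DistributedBatchSampler; 'Eval'
# and 'Test' use a plain BatchSampler, and SIGINT/SIGTERM are hooked to
# term_mp() so that worker processes are killed on exit.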
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import numpy as np +import skimage +import paddle +import signal +import random + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../..'))) + +import copy +from paddle.io import Dataset, DataLoader, BatchSampler, DistributedBatchSampler +import paddle.distributed as dist + +from ppocr.data.imaug import transform, create_operators +from ppocr.data.simple_dataset import SimpleDataSet +from ppocr.data.lmdb_dataset import LMDBDataSet, LMDBDataSetSR +from ppocr.data.pgnet_dataset import PGDataSet +from ppocr.data.pubtab_dataset import PubTabDataSet + +__all__ = ['build_dataloader', 'transform', 'create_operators'] + + +def term_mp(sig_num, frame): + """ kill all child processes + """ + pid = os.getpid() + pgid = os.getpgid(os.getpid()) + print("main proc {} exit, kill process group " "{}".format(pid, pgid)) + os.killpg(pgid, signal.SIGKILL) + + +def build_dataloader(config, mode, device, logger, seed=None): + config = copy.deepcopy(config) + + support_dict = [ + 'SimpleDataSet', 'LMDBDataSet', 'PGDataSet', 'PubTabDataSet', + 'LMDBDataSetSR' + ] + module_name = config[mode]['dataset']['name'] + assert module_name in support_dict, Exception( + 'DataSet only support {}'.format(support_dict)) + assert mode in ['Train', 'Eval', 'Test' + ], "Mode should be Train, Eval or Test." + + dataset = eval(module_name)(config, mode, logger, seed) + loader_config = config[mode]['loader'] + batch_size = loader_config['batch_size_per_card'] + drop_last = loader_config['drop_last'] + shuffle = loader_config['shuffle'] + num_workers = loader_config['num_workers'] + if 'use_shared_memory' in loader_config.keys(): + use_shared_memory = loader_config['use_shared_memory'] + else: + use_shared_memory = True + + if mode == "Train": + # Distribute data to multiple cards + batch_sampler = DistributedBatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + else: + # Distribute data to single card + batch_sampler = BatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last) + + if 'collate_fn' in loader_config: + from . import collate_fn + collate_fn = getattr(collate_fn, loader_config['collate_fn'])() + else: + collate_fn = None + data_loader = DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + collate_fn=collate_fn) + + # support exit using ctrl+c + signal.signal(signal.SIGINT, term_mp) + signal.signal(signal.SIGTERM, term_mp) + + return data_loader diff --git a/ppocr/data/collate_fn.py b/ppocr/data/collate_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..067b2158aca183c68c3a09999483c059bb10eb14 --- /dev/null +++ b/ppocr/data/collate_fn.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numbers +import numpy as np +from collections import defaultdict + + +class DictCollator(object): + """ + data batch + """ + + def __call__(self, batch): + # todo:support batch operators + data_dict = defaultdict(list) + to_tensor_keys = [] + for sample in batch: + for k, v in sample.items(): + if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)): + if k not in to_tensor_keys: + to_tensor_keys.append(k) + data_dict[k].append(v) + for k in to_tensor_keys: + data_dict[k] = paddle.to_tensor(data_dict[k]) + return data_dict + + +class ListCollator(object): + """ + data batch + """ + + def __call__(self, batch): + # todo:support batch operators + data_dict = defaultdict(list) + to_tensor_idxs = [] + for sample in batch: + for idx, v in enumerate(sample): + if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)): + if idx not in to_tensor_idxs: + to_tensor_idxs.append(idx) + data_dict[idx].append(v) + for idx in to_tensor_idxs: + data_dict[idx] = paddle.to_tensor(data_dict[idx]) + return list(data_dict.values()) + + +class SSLRotateCollate(object): + """ + bach: [ + [(4*3xH*W), (4,)] + [(4*3xH*W), (4,)] + ... + ] + """ + + def __call__(self, batch): + output = [np.concatenate(d, axis=0) for d in zip(*batch)] + return output + + +class DyMaskCollator(object): + """ + batch: [ + image [batch_size, channel, maxHinbatch, maxWinbatch] + image_mask [batch_size, channel, maxHinbatch, maxWinbatch] + label [batch_size, maxLabelLen] + label_mask [batch_size, maxLabelLen] + ... + ] + """ + + def __call__(self, batch): + max_width, max_height, max_length = 0, 0, 0 + bs, channel = len(batch), batch[0][0].shape[0] + proper_items = [] + for item in batch: + if item[0].shape[1] * max_width > 1600 * 320 or item[0].shape[ + 2] * max_height > 1600 * 320: + continue + max_height = item[0].shape[1] if item[0].shape[ + 1] > max_height else max_height + max_width = item[0].shape[2] if item[0].shape[ + 2] > max_width else max_width + max_length = len(item[1]) if len(item[ + 1]) > max_length else max_length + proper_items.append(item) + + images, image_masks = np.zeros( + (len(proper_items), channel, max_height, max_width), + dtype='float32'), np.zeros( + (len(proper_items), 1, max_height, max_width), dtype='float32') + labels, label_masks = np.zeros( + (len(proper_items), max_length), dtype='int64'), np.zeros( + (len(proper_items), max_length), dtype='int64') + + for i in range(len(proper_items)): + _, h, w = proper_items[i][0].shape + images[i][:, :h, :w] = proper_items[i][0] + image_masks[i][:, :h, :w] = 1 + l = len(proper_items[i][1]) + labels[i][:l] = proper_items[i][1] + label_masks[i][:l] = 1 + + return images, image_masks, labels, label_masks diff --git a/ppocr/data/imaug/ColorJitter.py b/ppocr/data/imaug/ColorJitter.py new file mode 100644 index 0000000000000000000000000000000000000000..4b542abc8f9dc5af76529f9feb4bcb8b47b5f7d0 --- /dev/null +++ b/ppocr/data/imaug/ColorJitter.py @@ -0,0 +1,26 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle.vision.transforms import ColorJitter as pp_ColorJitter + +__all__ = ['ColorJitter'] + +class ColorJitter(object): + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0,**kwargs): + self.aug = pp_ColorJitter(brightness, contrast, saturation, hue) + + def __call__(self, data): + image = data['image'] + image = self.aug(image) + data['image'] = image + return data diff --git a/ppocr/data/imaug/__init__.py b/ppocr/data/imaug/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..121582b4908750fca6612dc592a3671ef4dcb328 --- /dev/null +++ b/ppocr/data/imaug/__init__.py @@ -0,0 +1,80 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .iaa_augment import IaaAugment +from .make_border_map import MakeBorderMap +from .make_shrink_map import MakeShrinkMap +from .random_crop_data import EastRandomCropData, RandomCropImgMask +from .make_pse_gt import MakePseGt + + + +from .rec_img_aug import BaseDataAugmentation, RecAug, RecConAug, RecResizeImg, ClsResizeImg, \ + SRNRecResizeImg, GrayRecResizeImg, SARRecResizeImg, PRENResizeImg, \ + ABINetRecResizeImg, SVTRRecResizeImg, ABINetRecAug, VLRecResizeImg, SPINRecResizeImg, RobustScannerRecResizeImg, \ + RFLRecResizeImg, SVTRRecAug +from .ssl_img_aug import SSLRotateResize +from .randaugment import RandAugment +from .copy_paste import CopyPaste +from .ColorJitter import ColorJitter +from .operators import * +from .label_ops import * + +from .east_process import * +from .sast_process import * +from .pg_process import * +from .table_ops import * + +from .vqa import * + +from .fce_aug import * +from .fce_targets import FCENetTargets +from .ct_process import * +from .drrg_targets import DRRGTargets + + +def transform(data, ops=None): + """ transform """ + if ops is None: + ops = [] + for op in ops: + data = op(data) + if data is None: + return None + return data + + +def create_operators(op_param_list, global_config=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(op_param_list, list), ('operator config should be a list') + ops = [] + for operator in op_param_list: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + if global_config is not None: + 
param.update(global_config) + op = eval(op_name)(**param) + ops.append(op) + return ops diff --git a/ppocr/data/imaug/abinet_aug.py b/ppocr/data/imaug/abinet_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..1b93751bc0b73e6425e73e078dfe189bdce5cdaf --- /dev/null +++ b/ppocr/data/imaug/abinet_aug.py @@ -0,0 +1,458 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FangShancheng/ABINet/blob/main/transforms.py +""" +import math +import numbers +import random + +import cv2 +import numpy as np +from paddle.vision.transforms import Compose, ColorJitter + + +def sample_asym(magnitude, size=None): + return np.random.beta(1, 4, size) * magnitude + + +def sample_sym(magnitude, size=None): + return (np.random.beta(4, 4, size=size) - 0.5) * 2 * magnitude + + +def sample_uniform(low, high, size=None): + return np.random.uniform(low, high, size=size) + + +def get_interpolation(type='random'): + if type == 'random': + choice = [ + cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA + ] + interpolation = choice[random.randint(0, len(choice) - 1)] + elif type == 'nearest': + interpolation = cv2.INTER_NEAREST + elif type == 'linear': + interpolation = cv2.INTER_LINEAR + elif type == 'cubic': + interpolation = cv2.INTER_CUBIC + elif type == 'area': + interpolation = cv2.INTER_AREA + else: + raise TypeError( + 'Interpolation types only nearest, linear, cubic, area are supported!' + ) + return interpolation + + +class CVRandomRotation(object): + def __init__(self, degrees=15): + assert isinstance(degrees, + numbers.Number), "degree should be a single number." + assert degrees >= 0, "degree must be positive." + self.degrees = degrees + + @staticmethod + def get_params(degrees): + return sample_sym(degrees) + + def __call__(self, img): + angle = self.get_params(self.degrees) + src_h, src_w = img.shape[:2] + M = cv2.getRotationMatrix2D( + center=(src_w / 2, src_h / 2), angle=angle, scale=1.0) + abs_cos, abs_sin = abs(M[0, 0]), abs(M[0, 1]) + dst_w = int(src_h * abs_sin + src_w * abs_cos) + dst_h = int(src_h * abs_cos + src_w * abs_sin) + M[0, 2] += (dst_w - src_w) / 2 + M[1, 2] += (dst_h - src_h) / 2 + + flags = get_interpolation() + return cv2.warpAffine( + img, + M, (dst_w, dst_h), + flags=flags, + borderMode=cv2.BORDER_REPLICATE) + + +class CVRandomAffine(object): + def __init__(self, degrees, translate=None, scale=None, shear=None): + assert isinstance(degrees, + numbers.Number), "degree should be a single number." + assert degrees >= 0, "degree must be positive." + self.degrees = degrees + + if translate is not None: + assert isinstance(translate, (tuple, list)) and len(translate) == 2, \ + "translate should be a list or tuple and it must be of length 2." 
+ for t in translate: + if not (0.0 <= t <= 1.0): + raise ValueError( + "translation values should be between 0 and 1") + self.translate = translate + + if scale is not None: + assert isinstance(scale, (tuple, list)) and len(scale) == 2, \ + "scale should be a list or tuple and it must be of length 2." + for s in scale: + if s <= 0: + raise ValueError("scale values should be positive") + self.scale = scale + + if shear is not None: + if isinstance(shear, numbers.Number): + if shear < 0: + raise ValueError( + "If shear is a single number, it must be positive.") + self.shear = [shear] + else: + assert isinstance(shear, (tuple, list)) and (len(shear) == 2), \ + "shear should be a list or tuple and it must be of length 2." + self.shear = shear + else: + self.shear = shear + + def _get_inverse_affine_matrix(self, center, angle, translate, scale, + shear): + # https://github.com/pytorch/vision/blob/v0.4.0/torchvision/transforms/functional.py#L717 + from numpy import sin, cos, tan + + if isinstance(shear, numbers.Number): + shear = [shear, 0] + + if not isinstance(shear, (tuple, list)) and len(shear) == 2: + raise ValueError( + "Shear should be a single value or a tuple/list containing " + + "two values. Got {}".format(shear)) + + rot = math.radians(angle) + sx, sy = [math.radians(s) for s in shear] + + cx, cy = center + tx, ty = translate + + # RSS without scaling + a = cos(rot - sy) / cos(sy) + b = -cos(rot - sy) * tan(sx) / cos(sy) - sin(rot) + c = sin(rot - sy) / cos(sy) + d = -sin(rot - sy) * tan(sx) / cos(sy) + cos(rot) + + # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + M = [d, -b, 0, -c, a, 0] + M = [x / scale for x in M] + + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + M[2] += M[0] * (-cx - tx) + M[1] * (-cy - ty) + M[5] += M[3] * (-cx - tx) + M[4] * (-cy - ty) + + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + M[2] += cx + M[5] += cy + return M + + @staticmethod + def get_params(degrees, translate, scale_ranges, shears, height): + angle = sample_sym(degrees) + if translate is not None: + max_dx = translate[0] * height + max_dy = translate[1] * height + translations = (np.round(sample_sym(max_dx)), + np.round(sample_sym(max_dy))) + else: + translations = (0, 0) + + if scale_ranges is not None: + scale = sample_uniform(scale_ranges[0], scale_ranges[1]) + else: + scale = 1.0 + + if shears is not None: + if len(shears) == 1: + shear = [sample_sym(shears[0]), 0.] 
+ elif len(shears) == 2: + shear = [sample_sym(shears[0]), sample_sym(shears[1])] + else: + shear = 0.0 + + return angle, translations, scale, shear + + def __call__(self, img): + src_h, src_w = img.shape[:2] + angle, translate, scale, shear = self.get_params( + self.degrees, self.translate, self.scale, self.shear, src_h) + + M = self._get_inverse_affine_matrix((src_w / 2, src_h / 2), angle, + (0, 0), scale, shear) + M = np.array(M).reshape(2, 3) + + startpoints = [(0, 0), (src_w - 1, 0), (src_w - 1, src_h - 1), + (0, src_h - 1)] + project = lambda x, y, a, b, c: int(a * x + b * y + c) + endpoints = [(project(x, y, *M[0]), project(x, y, *M[1])) + for x, y in startpoints] + + rect = cv2.minAreaRect(np.array(endpoints)) + bbox = cv2.boxPoints(rect).astype(dtype=np.int) + max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max() + min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min() + + dst_w = int(max_x - min_x) + dst_h = int(max_y - min_y) + M[0, 2] += (dst_w - src_w) / 2 + M[1, 2] += (dst_h - src_h) / 2 + + # add translate + dst_w += int(abs(translate[0])) + dst_h += int(abs(translate[1])) + if translate[0] < 0: M[0, 2] += abs(translate[0]) + if translate[1] < 0: M[1, 2] += abs(translate[1]) + + flags = get_interpolation() + return cv2.warpAffine( + img, + M, (dst_w, dst_h), + flags=flags, + borderMode=cv2.BORDER_REPLICATE) + + +class CVRandomPerspective(object): + def __init__(self, distortion=0.5): + self.distortion = distortion + + def get_params(self, width, height, distortion): + offset_h = sample_asym( + distortion * height / 2, size=4).astype(dtype=np.int) + offset_w = sample_asym( + distortion * width / 2, size=4).astype(dtype=np.int) + topleft = (offset_w[0], offset_h[0]) + topright = (width - 1 - offset_w[1], offset_h[1]) + botright = (width - 1 - offset_w[2], height - 1 - offset_h[2]) + botleft = (offset_w[3], height - 1 - offset_h[3]) + + startpoints = [(0, 0), (width - 1, 0), (width - 1, height - 1), + (0, height - 1)] + endpoints = [topleft, topright, botright, botleft] + return np.array( + startpoints, dtype=np.float32), np.array( + endpoints, dtype=np.float32) + + def __call__(self, img): + height, width = img.shape[:2] + startpoints, endpoints = self.get_params(width, height, self.distortion) + M = cv2.getPerspectiveTransform(startpoints, endpoints) + + # TODO: more robust way to crop image + rect = cv2.minAreaRect(endpoints) + bbox = cv2.boxPoints(rect).astype(dtype=np.int) + max_x, max_y = bbox[:, 0].max(), bbox[:, 1].max() + min_x, min_y = bbox[:, 0].min(), bbox[:, 1].min() + min_x, min_y = max(min_x, 0), max(min_y, 0) + + flags = get_interpolation() + img = cv2.warpPerspective( + img, + M, (max_x, max_y), + flags=flags, + borderMode=cv2.BORDER_REPLICATE) + img = img[min_y:, min_x:] + return img + + +class CVRescale(object): + def __init__(self, factor=4, base_size=(128, 512)): + """ Define image scales using gaussian pyramid and rescale image to target scale. + + Args: + factor: the decayed factor from base size, factor=4 keeps target scale by default. 
+ base_size: base size the build the bottom layer of pyramid + """ + if isinstance(factor, numbers.Number): + self.factor = round(sample_uniform(0, factor)) + elif isinstance(factor, (tuple, list)) and len(factor) == 2: + self.factor = round(sample_uniform(factor[0], factor[1])) + else: + raise Exception('factor must be number or list with length 2') + # assert factor is valid + self.base_h, self.base_w = base_size[:2] + + def __call__(self, img): + if self.factor == 0: return img + src_h, src_w = img.shape[:2] + cur_w, cur_h = self.base_w, self.base_h + scale_img = cv2.resize( + img, (cur_w, cur_h), interpolation=get_interpolation()) + for _ in range(self.factor): + scale_img = cv2.pyrDown(scale_img) + scale_img = cv2.resize( + scale_img, (src_w, src_h), interpolation=get_interpolation()) + return scale_img + + +class CVGaussianNoise(object): + def __init__(self, mean=0, var=20): + self.mean = mean + if isinstance(var, numbers.Number): + self.var = max(int(sample_asym(var)), 1) + elif isinstance(var, (tuple, list)) and len(var) == 2: + self.var = int(sample_uniform(var[0], var[1])) + else: + raise Exception('degree must be number or list with length 2') + + def __call__(self, img): + noise = np.random.normal(self.mean, self.var**0.5, img.shape) + img = np.clip(img + noise, 0, 255).astype(np.uint8) + return img + + +class CVMotionBlur(object): + def __init__(self, degrees=12, angle=90): + if isinstance(degrees, numbers.Number): + self.degree = max(int(sample_asym(degrees)), 1) + elif isinstance(degrees, (tuple, list)) and len(degrees) == 2: + self.degree = int(sample_uniform(degrees[0], degrees[1])) + else: + raise Exception('degree must be number or list with length 2') + self.angle = sample_uniform(-angle, angle) + + def __call__(self, img): + M = cv2.getRotationMatrix2D((self.degree // 2, self.degree // 2), + self.angle, 1) + motion_blur_kernel = np.zeros((self.degree, self.degree)) + motion_blur_kernel[self.degree // 2, :] = 1 + motion_blur_kernel = cv2.warpAffine(motion_blur_kernel, M, + (self.degree, self.degree)) + motion_blur_kernel = motion_blur_kernel / self.degree + img = cv2.filter2D(img, -1, motion_blur_kernel) + img = np.clip(img, 0, 255).astype(np.uint8) + return img + + +class CVGeometry(object): + def __init__(self, + degrees=15, + translate=(0.3, 0.3), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=0.5): + self.p = p + type_p = random.random() + if type_p < 0.33: + self.transforms = CVRandomRotation(degrees=degrees) + elif type_p < 0.66: + self.transforms = CVRandomAffine( + degrees=degrees, translate=translate, scale=scale, shear=shear) + else: + self.transforms = CVRandomPerspective(distortion=distortion) + + def __call__(self, img): + if random.random() < self.p: + return self.transforms(img) + else: + return img + + +class CVDeterioration(object): + def __init__(self, var, degrees, factor, p=0.5): + self.p = p + transforms = [] + if var is not None: + transforms.append(CVGaussianNoise(var=var)) + if degrees is not None: + transforms.append(CVMotionBlur(degrees=degrees)) + if factor is not None: + transforms.append(CVRescale(factor=factor)) + + random.shuffle(transforms) + transforms = Compose(transforms) + self.transforms = transforms + + def __call__(self, img): + if random.random() < self.p: + + return self.transforms(img) + else: + return img + + +class CVColorJitter(object): + def __init__(self, + brightness=0.5, + contrast=0.5, + saturation=0.5, + hue=0.1, + p=0.5): + self.p = p + self.transforms = ColorJitter( + brightness=brightness, + 
contrast=contrast, + saturation=saturation, + hue=hue) + + def __call__(self, img): + if random.random() < self.p: return self.transforms(img) + else: return img + + +class SVTRDeterioration(object): + def __init__(self, var, degrees, factor, p=0.5): + self.p = p + transforms = [] + if var is not None: + transforms.append(CVGaussianNoise(var=var)) + if degrees is not None: + transforms.append(CVMotionBlur(degrees=degrees)) + if factor is not None: + transforms.append(CVRescale(factor=factor)) + self.transforms = transforms + + def __call__(self, img): + if random.random() < self.p: + random.shuffle(self.transforms) + transforms = Compose(self.transforms) + return transforms(img) + else: + return img + + +class SVTRGeometry(object): + def __init__(self, + aug_type=0, + degrees=15, + translate=(0.3, 0.3), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=0.5): + self.aug_type = aug_type + self.p = p + self.transforms = [] + self.transforms.append(CVRandomRotation(degrees=degrees)) + self.transforms.append(CVRandomAffine( + degrees=degrees, translate=translate, scale=scale, shear=shear)) + self.transforms.append(CVRandomPerspective(distortion=distortion)) + + def __call__(self, img): + if random.random() < self.p: + if self.aug_type: + random.shuffle(self.transforms) + transforms = Compose(self.transforms[:random.randint(1, 3)]) + img = transforms(img) + else: + img = self.transforms[random.randint(0, 2)](img) + return img + else: + return img \ No newline at end of file diff --git a/ppocr/data/imaug/copy_paste.py b/ppocr/data/imaug/copy_paste.py new file mode 100644 index 0000000000000000000000000000000000000000..79343da60fd40f8dc0ffe8927398b70cb751b532 --- /dev/null +++ b/ppocr/data/imaug/copy_paste.py @@ -0,0 +1,174 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
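# --- Editor's note: illustrative sketch only, not part of the upstream file. ---
# CopyPaste below is a detection-time augmentation: it crops rotated text boxes
# from a second sample (data['ext_data'][0], an extra sample the dataset
# pipeline is expected to attach because ext_data_num = 1), pastes them onto
# the current image, and appends the matching polygons/texts/ignore_tags.
# A hypothetical operator entry enabling it could look like:
#
#   {'CopyPaste': {'objects_paste_ratio': 0.2, 'limit_paste': True}}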
+import copy +import cv2 +import random +import numpy as np +from PIL import Image +from shapely.geometry import Polygon + +from ppocr.data.imaug.iaa_augment import IaaAugment +from ppocr.data.imaug.random_crop_data import is_poly_outside_rect +from tools.infer.utility import get_rotate_crop_image + + +class CopyPaste(object): + def __init__(self, objects_paste_ratio=0.2, limit_paste=True, **kwargs): + self.ext_data_num = 1 + self.objects_paste_ratio = objects_paste_ratio + self.limit_paste = limit_paste + augmenter_args = [{'type': 'Resize', 'args': {'size': [0.5, 3]}}] + self.aug = IaaAugment(augmenter_args) + + def __call__(self, data): + point_num = data['polys'].shape[1] + src_img = data['image'] + src_polys = data['polys'].tolist() + src_texts = data['texts'] + src_ignores = data['ignore_tags'].tolist() + ext_data = data['ext_data'][0] + ext_image = ext_data['image'] + ext_polys = ext_data['polys'] + ext_texts = ext_data['texts'] + ext_ignores = ext_data['ignore_tags'] + + indexs = [i for i in range(len(ext_ignores)) if not ext_ignores[i]] + select_num = max( + 1, min(int(self.objects_paste_ratio * len(ext_polys)), 30)) + + random.shuffle(indexs) + select_idxs = indexs[:select_num] + select_polys = ext_polys[select_idxs] + select_ignores = ext_ignores[select_idxs] + + src_img = cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB) + ext_image = cv2.cvtColor(ext_image, cv2.COLOR_BGR2RGB) + src_img = Image.fromarray(src_img).convert('RGBA') + for idx, poly, tag in zip(select_idxs, select_polys, select_ignores): + box_img = get_rotate_crop_image(ext_image, poly) + + src_img, box = self.paste_img(src_img, box_img, src_polys) + if box is not None: + box = box.tolist() + for _ in range(len(box), point_num): + box.append(box[-1]) + src_polys.append(box) + src_texts.append(ext_texts[idx]) + src_ignores.append(tag) + src_img = cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR) + h, w = src_img.shape[:2] + src_polys = np.array(src_polys) + src_polys[:, :, 0] = np.clip(src_polys[:, :, 0], 0, w) + src_polys[:, :, 1] = np.clip(src_polys[:, :, 1], 0, h) + data['image'] = src_img + data['polys'] = src_polys + data['texts'] = src_texts + data['ignore_tags'] = np.array(src_ignores) + return data + + def paste_img(self, src_img, box_img, src_polys): + box_img_pil = Image.fromarray(box_img).convert('RGBA') + src_w, src_h = src_img.size + box_w, box_h = box_img_pil.size + + angle = np.random.randint(0, 360) + box = np.array([[[0, 0], [box_w, 0], [box_w, box_h], [0, box_h]]]) + box = rotate_bbox(box_img, box, angle)[0] + box_img_pil = box_img_pil.rotate(angle, expand=1) + box_w, box_h = box_img_pil.width, box_img_pil.height + if src_w - box_w < 0 or src_h - box_h < 0: + return src_img, None + + paste_x, paste_y = self.select_coord(src_polys, box, src_w - box_w, + src_h - box_h) + if paste_x is None: + return src_img, None + box[:, 0] += paste_x + box[:, 1] += paste_y + r, g, b, A = box_img_pil.split() + src_img.paste(box_img_pil, (paste_x, paste_y), mask=A) + + return src_img, box + + def select_coord(self, src_polys, box, endx, endy): + if self.limit_paste: + xmin, ymin, xmax, ymax = box[:, 0].min(), box[:, 1].min( + ), box[:, 0].max(), box[:, 1].max() + for _ in range(50): + paste_x = random.randint(0, endx) + paste_y = random.randint(0, endy) + xmin1 = xmin + paste_x + xmax1 = xmax + paste_x + ymin1 = ymin + paste_y + ymax1 = ymax + paste_y + + num_poly_in_rect = 0 + for poly in src_polys: + if not is_poly_outside_rect(poly, xmin1, ymin1, + xmax1 - xmin1, ymax1 - ymin1): + num_poly_in_rect += 1 + break + if 
num_poly_in_rect == 0: + return paste_x, paste_y + return None, None + else: + paste_x = random.randint(0, endx) + paste_y = random.randint(0, endy) + return paste_x, paste_y + + +def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + +def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + +def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + +def rotate_bbox(img, text_polys, angle, scale=1): + """ + from https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/augment.py + Args: + img: np.ndarray + text_polys: np.ndarray N*4*2 + angle: int + scale: int + + Returns: + + """ + w = img.shape[1] + h = img.shape[0] + + rangle = np.deg2rad(angle) + nw = (abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)) + nh = (abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)) + rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, scale) + rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0])) + rot_mat[0, 2] += rot_move[0] + rot_mat[1, 2] += rot_move[1] + + # ---------------------- rotate box ---------------------- + rot_text_polys = list() + for bbox in text_polys: + point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1])) + point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1])) + point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1])) + point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1])) + rot_text_polys.append([point1, point2, point3, point4]) + return np.array(rot_text_polys, dtype=np.float32) diff --git a/ppocr/data/imaug/ct_process.py b/ppocr/data/imaug/ct_process.py new file mode 100644 index 0000000000000000000000000000000000000000..59715090036e1020800950b02b9ea06ab5c8d4c2 --- /dev/null +++ b/ppocr/data/imaug/ct_process.py @@ -0,0 +1,355 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
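# --- Editor's note: illustrative sketch only, not part of the upstream file. ---
# The data operators below (RandomScale, MakeShrink, GroupRandomHorizontalFlip,
# MakeCentripetalShift, ...) follow the same protocol as the rest of
# ppocr/data/imaug: each is built from a single-key dict and called on a `data`
# dict carrying 'image', 'polys', 'texts', etc. A minimal, hypothetical
# chaining with the helpers from ppocr/data/imaug/__init__.py:
#
#   ops = create_operators([{'RandomScale': {'short_size': 640}},
#                           {'MakeShrink': {'kernel_scale': 0.7}}])
#   data = transform({'image': img, 'polys': polys, 'texts': texts}, ops)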
+ +import os +import cv2 +import random +import pyclipper +import paddle + +import numpy as np +import Polygon as plg +import scipy.io as scio + +from PIL import Image +import paddle.vision.transforms as transforms + + +class RandomScale(): + def __init__(self, short_size=640, **kwargs): + self.short_size = short_size + + def scale_aligned(self, img, scale): + oh, ow = img.shape[0:2] + h = int(oh * scale + 0.5) + w = int(ow * scale + 0.5) + if h % 32 != 0: + h = h + (32 - h % 32) + if w % 32 != 0: + w = w + (32 - w % 32) + img = cv2.resize(img, dsize=(w, h)) + factor_h = h / oh + factor_w = w / ow + return img, factor_h, factor_w + + def __call__(self, data): + img = data['image'] + + h, w = img.shape[0:2] + random_scale = np.array([0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]) + scale = (np.random.choice(random_scale) * self.short_size) / min(h, w) + img, factor_h, factor_w = self.scale_aligned(img, scale) + + data['scale_factor'] = (factor_w, factor_h) + data['image'] = img + return data + + +class MakeShrink(): + def __init__(self, kernel_scale=0.7, **kwargs): + self.kernel_scale = kernel_scale + + def dist(self, a, b): + return np.linalg.norm((a - b), ord=2, axis=0) + + def perimeter(self, bbox): + peri = 0.0 + for i in range(bbox.shape[0]): + peri += self.dist(bbox[i], bbox[(i + 1) % bbox.shape[0]]) + return peri + + def shrink(self, bboxes, rate, max_shr=20): + rate = rate * rate + shrinked_bboxes = [] + for bbox in bboxes: + area = plg.Polygon(bbox).area() + peri = self.perimeter(bbox) + + try: + pco = pyclipper.PyclipperOffset() + pco.AddPath(bbox, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + offset = min( + int(area * (1 - rate) / (peri + 0.001) + 0.5), max_shr) + + shrinked_bbox = pco.Execute(-offset) + if len(shrinked_bbox) == 0: + shrinked_bboxes.append(bbox) + continue + + shrinked_bbox = np.array(shrinked_bbox[0]) + if shrinked_bbox.shape[0] <= 2: + shrinked_bboxes.append(bbox) + continue + + shrinked_bboxes.append(shrinked_bbox) + except Exception as e: + shrinked_bboxes.append(bbox) + + return shrinked_bboxes + + def __call__(self, data): + img = data['image'] + bboxes = data['polys'] + words = data['texts'] + scale_factor = data['scale_factor'] + + gt_instance = np.zeros(img.shape[0:2], dtype='uint8') # h,w + training_mask = np.ones(img.shape[0:2], dtype='uint8') + training_mask_distance = np.ones(img.shape[0:2], dtype='uint8') + + for i in range(len(bboxes)): + bboxes[i] = np.reshape(bboxes[i] * ( + [scale_factor[0], scale_factor[1]] * (bboxes[i].shape[0] // 2)), + (bboxes[i].shape[0] // 2, 2)).astype('int32') + + for i in range(len(bboxes)): + #different value for different bbox + cv2.drawContours(gt_instance, [bboxes[i]], -1, i + 1, -1) + + # set training mask to 0 + cv2.drawContours(training_mask, [bboxes[i]], -1, 0, -1) + + # for not accurate annotation, use training_mask_distance + if words[i] == '###' or words[i] == '???': + cv2.drawContours(training_mask_distance, [bboxes[i]], -1, 0, -1) + + # make shrink + gt_kernel_instance = np.zeros(img.shape[0:2], dtype='uint8') + kernel_bboxes = self.shrink(bboxes, self.kernel_scale) + for i in range(len(bboxes)): + cv2.drawContours(gt_kernel_instance, [kernel_bboxes[i]], -1, i + 1, + -1) + + # for training mask, kernel and background= 1, box region=0 + if words[i] != '###' and words[i] != '???': + cv2.drawContours(training_mask, [kernel_bboxes[i]], -1, 1, -1) + + gt_kernel = gt_kernel_instance.copy() + # for gt_kernel, kernel = 1 + gt_kernel[gt_kernel > 0] = 1 + + # shrink 2 times + tmp1 = gt_kernel_instance.copy() + 
erode_kernel = np.ones((3, 3), np.uint8) + tmp1 = cv2.erode(tmp1, erode_kernel, iterations=1) + tmp2 = tmp1.copy() + tmp2 = cv2.erode(tmp2, erode_kernel, iterations=1) + + # compute text region + gt_kernel_inner = tmp1 - tmp2 + + # gt_instance: text instance, bg=0, diff word use diff value + # training_mask: text instance mask, word=0,kernel and bg=1 + # gt_kernel_instance: text kernel instance, bg=0, diff word use diff value + # gt_kernel: text_kernel, bg=0,diff word use same value + # gt_kernel_inner: text kernel reference + # training_mask_distance: word without anno = 0, else 1 + + data['image'] = [ + img, gt_instance, training_mask, gt_kernel_instance, gt_kernel, + gt_kernel_inner, training_mask_distance + ] + return data + + +class GroupRandomHorizontalFlip(): + def __init__(self, p=0.5, **kwargs): + self.p = p + + def __call__(self, data): + imgs = data['image'] + + if random.random() < self.p: + for i in range(len(imgs)): + imgs[i] = np.flip(imgs[i], axis=1).copy() + data['image'] = imgs + return data + + +class GroupRandomRotate(): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + imgs = data['image'] + + max_angle = 10 + angle = random.random() * 2 * max_angle - max_angle + for i in range(len(imgs)): + img = imgs[i] + w, h = img.shape[:2] + rotation_matrix = cv2.getRotationMatrix2D((h / 2, w / 2), angle, 1) + img_rotation = cv2.warpAffine( + img, rotation_matrix, (h, w), flags=cv2.INTER_NEAREST) + imgs[i] = img_rotation + + data['image'] = imgs + return data + + +class GroupRandomCropPadding(): + def __init__(self, target_size=(640, 640), **kwargs): + self.target_size = target_size + + def __call__(self, data): + imgs = data['image'] + + h, w = imgs[0].shape[0:2] + t_w, t_h = self.target_size + p_w, p_h = self.target_size + if w == t_w and h == t_h: + return data + + t_h = t_h if t_h < h else h + t_w = t_w if t_w < w else w + + if random.random() > 3.0 / 8.0 and np.max(imgs[1]) > 0: + # make sure to crop the text region + tl = np.min(np.where(imgs[1] > 0), axis=1) - (t_h, t_w) + tl[tl < 0] = 0 + br = np.max(np.where(imgs[1] > 0), axis=1) - (t_h, t_w) + br[br < 0] = 0 + br[0] = min(br[0], h - t_h) + br[1] = min(br[1], w - t_w) + + i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0 + j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0 + else: + i = random.randint(0, h - t_h) if h - t_h > 0 else 0 + j = random.randint(0, w - t_w) if w - t_w > 0 else 0 + + n_imgs = [] + for idx in range(len(imgs)): + if len(imgs[idx].shape) == 3: + s3_length = int(imgs[idx].shape[-1]) + img = imgs[idx][i:i + t_h, j:j + t_w, :] + img_p = cv2.copyMakeBorder( + img, + 0, + p_h - t_h, + 0, + p_w - t_w, + borderType=cv2.BORDER_CONSTANT, + value=tuple(0 for i in range(s3_length))) + else: + img = imgs[idx][i:i + t_h, j:j + t_w] + img_p = cv2.copyMakeBorder( + img, + 0, + p_h - t_h, + 0, + p_w - t_w, + borderType=cv2.BORDER_CONSTANT, + value=(0, )) + n_imgs.append(img_p) + + data['image'] = n_imgs + return data + + +class MakeCentripetalShift(): + def __init__(self, **kwargs): + pass + + def jaccard(self, As, Bs): + A = As.shape[0] # small + B = Bs.shape[0] # large + + dis = np.sqrt( + np.sum((As[:, np.newaxis, :].repeat( + B, axis=1) - Bs[np.newaxis, :, :].repeat( + A, axis=0))**2, + axis=-1)) + + ind = np.argmin(dis, axis=-1) + + return ind + + def __call__(self, data): + imgs = data['image'] + + img, gt_instance, training_mask, gt_kernel_instance, gt_kernel, gt_kernel_inner, training_mask_distance = \ + imgs[0], imgs[1], imgs[2], imgs[3], imgs[4], imgs[5], imgs[6] + + 
max_instance = np.max(gt_instance) # num bbox + + # make centripetal shift + gt_distance = np.zeros((2, *img.shape[0:2]), dtype=np.float32) + for i in range(1, max_instance + 1): + # kernel_reference + ind = (gt_kernel_inner == i) + + if np.sum(ind) == 0: + training_mask[gt_instance == i] = 0 + training_mask_distance[gt_instance == i] = 0 + continue + + kpoints = np.array(np.where(ind)).transpose( + (1, 0))[:, ::-1].astype('float32') + + ind = (gt_instance == i) * (gt_kernel_instance == 0) + if np.sum(ind) == 0: + continue + pixels = np.where(ind) + + points = np.array(pixels).transpose( + (1, 0))[:, ::-1].astype('float32') + + bbox_ind = self.jaccard(points, kpoints) + + offset_gt = kpoints[bbox_ind] - points + + gt_distance[:, pixels[0], pixels[1]] = offset_gt.T * 0.1 + + img = Image.fromarray(img) + img = img.convert('RGB') + + data["image"] = img + data["gt_kernel"] = gt_kernel.astype("int64") + data["training_mask"] = training_mask.astype("int64") + data["gt_instance"] = gt_instance.astype("int64") + data["gt_kernel_instance"] = gt_kernel_instance.astype("int64") + data["training_mask_distance"] = training_mask_distance.astype("int64") + data["gt_distance"] = gt_distance.astype("float32") + + return data + + +class ScaleAlignedShort(): + def __init__(self, short_size=640, **kwargs): + self.short_size = short_size + + def __call__(self, data): + img = data['image'] + + org_img_shape = img.shape + + h, w = img.shape[0:2] + scale = self.short_size * 1.0 / min(h, w) + h = int(h * scale + 0.5) + w = int(w * scale + 0.5) + if h % 32 != 0: + h = h + (32 - h % 32) + if w % 32 != 0: + w = w + (32 - w % 32) + img = cv2.resize(img, dsize=(w, h)) + + new_img_shape = img.shape + img_shape = np.array(org_img_shape + new_img_shape) + + data['shape'] = img_shape + data['image'] = img + + return data \ No newline at end of file diff --git a/ppocr/data/imaug/drrg_targets.py b/ppocr/data/imaug/drrg_targets.py new file mode 100644 index 0000000000000000000000000000000000000000..c56e878b837328ef2efde40b96b5571dffbb4791 --- /dev/null +++ b/ppocr/data/imaug/drrg_targets.py @@ -0,0 +1,696 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
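# --- Editor's note: descriptive summary only, not part of the upstream file. ---
# DRRGTargets below converts polygon annotations into DRRG training targets:
# a shrunk text-center-region mask, per-pixel top/bottom height maps, sin/cos
# orientation maps, and a padded matrix of text-component attributes
# (x, y, h, w, cos, sin, comp_label) obtained via locality-aware NMS
# (lanms.merge_quadrangle_n9) plus random jittering of the components.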
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/drrg_targets.py +""" + +import cv2 +import numpy as np +from lanms import merge_quadrangle_n9 as la_nms +from numpy.linalg import norm + + +class DRRGTargets(object): + def __init__(self, + orientation_thr=2.0, + resample_step=8.0, + num_min_comps=9, + num_max_comps=600, + min_width=8.0, + max_width=24.0, + center_region_shrink_ratio=0.3, + comp_shrink_ratio=1.0, + comp_w_h_ratio=0.3, + text_comp_nms_thr=0.25, + min_rand_half_height=8.0, + max_rand_half_height=24.0, + jitter_level=0.2, + **kwargs): + + super().__init__() + self.orientation_thr = orientation_thr + self.resample_step = resample_step + self.num_max_comps = num_max_comps + self.num_min_comps = num_min_comps + self.min_width = min_width + self.max_width = max_width + self.center_region_shrink_ratio = center_region_shrink_ratio + self.comp_shrink_ratio = comp_shrink_ratio + self.comp_w_h_ratio = comp_w_h_ratio + self.text_comp_nms_thr = text_comp_nms_thr + self.min_rand_half_height = min_rand_half_height + self.max_rand_half_height = max_rand_half_height + self.jitter_level = jitter_level + self.eps = 1e-8 + + def vector_angle(self, vec1, vec2): + if vec1.ndim > 1: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + self.eps).reshape((-1, 1)) + else: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + self.eps) + if vec2.ndim > 1: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + self.eps).reshape((-1, 1)) + else: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + self.eps) + return np.arccos( + np.clip( + np.sum(unit_vec1 * unit_vec2, axis=-1), -1.0, 1.0)) + + def vector_slope(self, vec): + assert len(vec) == 2 + return abs(vec[1] / (vec[0] + self.eps)) + + def vector_sin(self, vec): + assert len(vec) == 2 + return vec[1] / (norm(vec) + self.eps) + + def vector_cos(self, vec): + assert len(vec) == 2 + return vec[0] / (norm(vec) + self.eps) + + def find_head_tail(self, points, orientation_thr): + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + assert isinstance(orientation_thr, float) + + if len(points) > 4: + pad_points = np.vstack([points, points[0]]) + edge_vec = pad_points[1:] - pad_points[:-1] + + theta_sum = [] + adjacent_vec_theta = [] + for i, edge_vec1 in enumerate(edge_vec): + adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]] + adjacent_edge_vec = edge_vec[adjacent_ind] + temp_theta_sum = np.sum( + self.vector_angle(edge_vec1, adjacent_edge_vec)) + temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0], + adjacent_edge_vec[1]) + theta_sum.append(temp_theta_sum) + adjacent_vec_theta.append(temp_adjacent_theta) + theta_sum_score = np.array(theta_sum) / np.pi + adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi + poly_center = np.mean(points, axis=0) + edge_dist = np.maximum( + norm( + pad_points[1:] - poly_center, axis=-1), + norm( + pad_points[:-1] - poly_center, axis=-1)) + dist_score = edge_dist / (np.max(edge_dist) + self.eps) + position_score = np.zeros(len(edge_vec)) + score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score + score += 0.35 * dist_score + if len(points) % 2 == 0: + position_score[(len(score) // 2 - 1)] += 1 + position_score[-1] += 1 + score += 0.1 * position_score + pad_score = np.concatenate([score, score]) + score_matrix = np.zeros((len(score), len(score) - 3)) + x = np.arange(len(score) - 3) / float(len(score) - 4) + gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power( + (x - 0.5) / 0.5, 2.) 
/ 2) + gaussian = gaussian / np.max(gaussian) + for i in range(len(score)): + score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len( + score) - 1)] * gaussian * 0.3 + + head_start, tail_increment = np.unravel_index(score_matrix.argmax(), + score_matrix.shape) + tail_start = (head_start + tail_increment + 2) % len(points) + head_end = (head_start + 1) % len(points) + tail_end = (tail_start + 1) % len(points) + + if head_end > tail_end: + head_start, tail_start = tail_start, head_start + head_end, tail_end = tail_end, head_end + head_inds = [head_start, head_end] + tail_inds = [tail_start, tail_end] + else: + if self.vector_slope(points[1] - points[0]) + self.vector_slope( + points[3] - points[2]) < self.vector_slope(points[ + 2] - points[1]) + self.vector_slope(points[0] - points[ + 3]): + horizontal_edge_inds = [[0, 1], [2, 3]] + vertical_edge_inds = [[3, 0], [1, 2]] + else: + horizontal_edge_inds = [[3, 0], [1, 2]] + vertical_edge_inds = [[0, 1], [2, 3]] + + vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[ + vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][ + 0]] - points[vertical_edge_inds[1][1]]) + horizontal_len_sum = norm(points[horizontal_edge_inds[0][ + 0]] - points[horizontal_edge_inds[0][1]]) + norm(points[ + horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1] + [1]]) + + if vertical_len_sum > horizontal_len_sum * orientation_thr: + head_inds = horizontal_edge_inds[0] + tail_inds = horizontal_edge_inds[1] + else: + head_inds = vertical_edge_inds[0] + tail_inds = vertical_edge_inds[1] + + return head_inds, tail_inds + + def reorder_poly_edge(self, points): + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + + head_inds, tail_inds = self.find_head_tail(points, self.orientation_thr) + head_edge, tail_edge = points[head_inds], points[tail_inds] + + pad_points = np.vstack([points, points]) + if tail_inds[1] < 1: + tail_inds[1] = len(points) + sideline1 = pad_points[head_inds[1]:tail_inds[1]] + sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))] + sideline_mean_shift = np.mean( + sideline1, axis=0) - np.mean( + sideline2, axis=0) + + if sideline_mean_shift[1] > 0: + top_sideline, bot_sideline = sideline2, sideline1 + else: + top_sideline, bot_sideline = sideline1, sideline2 + + return head_edge, tail_edge, top_sideline, bot_sideline + + def cal_curve_length(self, line): + + assert line.ndim == 2 + assert len(line) >= 2 + + edges_length = np.sqrt((line[1:, 0] - line[:-1, 0])**2 + (line[ + 1:, 1] - line[:-1, 1])**2) + total_length = np.sum(edges_length) + return edges_length, total_length + + def resample_line(self, line, n): + + assert line.ndim == 2 + assert line.shape[0] >= 2 + assert line.shape[1] == 2 + assert isinstance(n, int) + assert n > 2 + + edges_length, total_length = self.cal_curve_length(line) + t_org = np.insert(np.cumsum(edges_length), 0, 0) + unit_t = total_length / (n - 1) + t_equidistant = np.arange(1, n - 1, dtype=np.float32) * unit_t + edge_ind = 0 + points = [line[0]] + for t in t_equidistant: + while edge_ind < len(edges_length) - 1 and t > t_org[edge_ind + 1]: + edge_ind += 1 + t_l, t_r = t_org[edge_ind], t_org[edge_ind + 1] + weight = np.array( + [t_r - t, t - t_l], dtype=np.float32) / (t_r - t_l + self.eps) + p_coords = np.dot(weight, line[[edge_ind, edge_ind + 1]]) + points.append(p_coords) + points.append(line[-1]) + resampled_line = np.vstack(points) + + return resampled_line + + def resample_sidelines(self, sideline1, sideline2, resample_step): + + assert 
sideline1.ndim == sideline2.ndim == 2 + assert sideline1.shape[1] == sideline2.shape[1] == 2 + assert sideline1.shape[0] >= 2 + assert sideline2.shape[0] >= 2 + assert isinstance(resample_step, float) + + _, length1 = self.cal_curve_length(sideline1) + _, length2 = self.cal_curve_length(sideline2) + + avg_length = (length1 + length2) / 2 + resample_point_num = max(int(float(avg_length) / resample_step) + 1, 3) + + resampled_line1 = self.resample_line(sideline1, resample_point_num) + resampled_line2 = self.resample_line(sideline2, resample_point_num) + + return resampled_line1, resampled_line2 + + def dist_point2line(self, point, line): + + assert isinstance(line, tuple) + point1, point2 = line + d = abs(np.cross(point2 - point1, point - point1)) / ( + norm(point2 - point1) + 1e-8) + return d + + def draw_center_region_maps(self, top_line, bot_line, center_line, + center_region_mask, top_height_map, + bot_height_map, sin_map, cos_map, + region_shrink_ratio): + + assert top_line.shape == bot_line.shape == center_line.shape + assert (center_region_mask.shape == top_height_map.shape == + bot_height_map.shape == sin_map.shape == cos_map.shape) + assert isinstance(region_shrink_ratio, float) + + h, w = center_region_mask.shape + for i in range(0, len(center_line) - 1): + + top_mid_point = (top_line[i] + top_line[i + 1]) / 2 + bot_mid_point = (bot_line[i] + bot_line[i + 1]) / 2 + + sin_theta = self.vector_sin(top_mid_point - bot_mid_point) + cos_theta = self.vector_cos(top_mid_point - bot_mid_point) + + tl = center_line[i] + (top_line[i] - center_line[i] + ) * region_shrink_ratio + tr = center_line[i + 1] + (top_line[i + 1] - center_line[i + 1] + ) * region_shrink_ratio + br = center_line[i + 1] + (bot_line[i + 1] - center_line[i + 1] + ) * region_shrink_ratio + bl = center_line[i] + (bot_line[i] - center_line[i] + ) * region_shrink_ratio + current_center_box = np.vstack([tl, tr, br, bl]).astype(np.int32) + + cv2.fillPoly(center_region_mask, [current_center_box], color=1) + cv2.fillPoly(sin_map, [current_center_box], color=sin_theta) + cv2.fillPoly(cos_map, [current_center_box], color=cos_theta) + + current_center_box[:, 0] = np.clip(current_center_box[:, 0], 0, + w - 1) + current_center_box[:, 1] = np.clip(current_center_box[:, 1], 0, + h - 1) + min_coord = np.min(current_center_box, axis=0).astype(np.int32) + max_coord = np.max(current_center_box, axis=0).astype(np.int32) + current_center_box = current_center_box - min_coord + box_sz = (max_coord - min_coord + 1) + + center_box_mask = np.zeros((box_sz[1], box_sz[0]), dtype=np.uint8) + cv2.fillPoly(center_box_mask, [current_center_box], color=1) + + inds = np.argwhere(center_box_mask > 0) + inds = inds + (min_coord[1], min_coord[0]) + inds_xy = np.fliplr(inds) + top_height_map[(inds[:, 0], inds[:, 1])] = self.dist_point2line( + inds_xy, (top_line[i], top_line[i + 1])) + bot_height_map[(inds[:, 0], inds[:, 1])] = self.dist_point2line( + inds_xy, (bot_line[i], bot_line[i + 1])) + + def generate_center_mask_attrib_maps(self, img_size, text_polys): + + assert isinstance(img_size, tuple) + + h, w = img_size + + center_lines = [] + center_region_mask = np.zeros((h, w), np.uint8) + top_height_map = np.zeros((h, w), dtype=np.float32) + bot_height_map = np.zeros((h, w), dtype=np.float32) + sin_map = np.zeros((h, w), dtype=np.float32) + cos_map = np.zeros((h, w), dtype=np.float32) + + for poly in text_polys: + polygon_points = poly + _, _, top_line, bot_line = self.reorder_poly_edge(polygon_points) + resampled_top_line, resampled_bot_line = 
self.resample_sidelines( + top_line, bot_line, self.resample_step) + resampled_bot_line = resampled_bot_line[::-1] + center_line = (resampled_top_line + resampled_bot_line) / 2 + + if self.vector_slope(center_line[-1] - center_line[0]) > 2: + if (center_line[-1] - center_line[0])[1] < 0: + center_line = center_line[::-1] + resampled_top_line = resampled_top_line[::-1] + resampled_bot_line = resampled_bot_line[::-1] + else: + if (center_line[-1] - center_line[0])[0] < 0: + center_line = center_line[::-1] + resampled_top_line = resampled_top_line[::-1] + resampled_bot_line = resampled_bot_line[::-1] + + line_head_shrink_len = np.clip( + (norm(top_line[0] - bot_line[0]) * self.comp_w_h_ratio), + self.min_width, self.max_width) / 2 + line_tail_shrink_len = np.clip( + (norm(top_line[-1] - bot_line[-1]) * self.comp_w_h_ratio), + self.min_width, self.max_width) / 2 + num_head_shrink = int(line_head_shrink_len // self.resample_step) + num_tail_shrink = int(line_tail_shrink_len // self.resample_step) + if len(center_line) > num_head_shrink + num_tail_shrink + 2: + center_line = center_line[num_head_shrink:len(center_line) - + num_tail_shrink] + resampled_top_line = resampled_top_line[num_head_shrink:len( + resampled_top_line) - num_tail_shrink] + resampled_bot_line = resampled_bot_line[num_head_shrink:len( + resampled_bot_line) - num_tail_shrink] + center_lines.append(center_line.astype(np.int32)) + + self.draw_center_region_maps( + resampled_top_line, resampled_bot_line, center_line, + center_region_mask, top_height_map, bot_height_map, sin_map, + cos_map, self.center_region_shrink_ratio) + + return (center_lines, center_region_mask, top_height_map, + bot_height_map, sin_map, cos_map) + + def generate_rand_comp_attribs(self, num_rand_comps, center_sample_mask): + + assert isinstance(num_rand_comps, int) + assert num_rand_comps > 0 + assert center_sample_mask.ndim == 2 + + h, w = center_sample_mask.shape + + max_rand_half_height = self.max_rand_half_height + min_rand_half_height = self.min_rand_half_height + max_rand_height = max_rand_half_height * 2 + max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, + self.min_width, self.max_width) + margin = int( + np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 + + if 2 * margin + 1 > min(h, w): + + assert min(h, w) > (np.sqrt(2) * (self.min_width + 1)) + max_rand_half_height = max(min(h, w) / 4, self.min_width / 2 + 1) + min_rand_half_height = max(max_rand_half_height / 4, + self.min_width / 2) + + max_rand_height = max_rand_half_height * 2 + max_rand_width = np.clip(max_rand_height * self.comp_w_h_ratio, + self.min_width, self.max_width) + margin = int( + np.sqrt((max_rand_height / 2)**2 + (max_rand_width / 2)**2)) + 1 + + inner_center_sample_mask = np.zeros_like(center_sample_mask) + inner_center_sample_mask[margin:h - margin, margin:w - margin] = \ + center_sample_mask[margin:h - margin, margin:w - margin] + kernel_size = int(np.clip(max_rand_half_height, 7, 21)) + inner_center_sample_mask = cv2.erode( + inner_center_sample_mask, + np.ones((kernel_size, kernel_size), np.uint8)) + + center_candidates = np.argwhere(inner_center_sample_mask > 0) + num_center_candidates = len(center_candidates) + sample_inds = np.random.choice(num_center_candidates, num_rand_comps) + rand_centers = center_candidates[sample_inds] + + rand_top_height = np.random.randint( + min_rand_half_height, + max_rand_half_height, + size=(len(rand_centers), 1)) + rand_bot_height = np.random.randint( + min_rand_half_height, + max_rand_half_height, + 
size=(len(rand_centers), 1)) + + rand_cos = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 + rand_sin = 2 * np.random.random(size=(len(rand_centers), 1)) - 1 + scale = np.sqrt(1.0 / (rand_cos**2 + rand_sin**2 + 1e-8)) + rand_cos = rand_cos * scale + rand_sin = rand_sin * scale + + height = (rand_top_height + rand_bot_height) + width = np.clip(height * self.comp_w_h_ratio, self.min_width, + self.max_width) + + rand_comp_attribs = np.hstack([ + rand_centers[:, ::-1], height, width, rand_cos, rand_sin, + np.zeros_like(rand_sin) + ]).astype(np.float32) + + return rand_comp_attribs + + def jitter_comp_attribs(self, comp_attribs, jitter_level): + """Jitter text components attributes. + + Args: + comp_attribs (ndarray): The text component attributes. + jitter_level (float): The jitter level of text components + attributes. + + Returns: + jittered_comp_attribs (ndarray): The jittered text component + attributes (x, y, h, w, cos, sin, comp_label). + """ + + assert comp_attribs.shape[1] == 7 + assert comp_attribs.shape[0] > 0 + assert isinstance(jitter_level, float) + + x = comp_attribs[:, 0].reshape((-1, 1)) + y = comp_attribs[:, 1].reshape((-1, 1)) + h = comp_attribs[:, 2].reshape((-1, 1)) + w = comp_attribs[:, 3].reshape((-1, 1)) + cos = comp_attribs[:, 4].reshape((-1, 1)) + sin = comp_attribs[:, 5].reshape((-1, 1)) + comp_labels = comp_attribs[:, 6].reshape((-1, 1)) + + x += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * ( + h * np.abs(cos) + w * np.abs(sin)) * jitter_level + y += (np.random.random(size=(len(comp_attribs), 1)) - 0.5) * ( + h * np.abs(sin) + w * np.abs(cos)) * jitter_level + + h += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * h * jitter_level + w += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * w * jitter_level + + cos += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * 2 * jitter_level + sin += (np.random.random(size=(len(comp_attribs), 1)) - 0.5 + ) * 2 * jitter_level + + scale = np.sqrt(1.0 / (cos**2 + sin**2 + 1e-8)) + cos = cos * scale + sin = sin * scale + + jittered_comp_attribs = np.hstack([x, y, h, w, cos, sin, comp_labels]) + + return jittered_comp_attribs + + def generate_comp_attribs(self, center_lines, text_mask, center_region_mask, + top_height_map, bot_height_map, sin_map, cos_map): + """Generate text component attributes. + + Args: + center_lines (list[ndarray]): The list of text center lines . + text_mask (ndarray): The text region mask. + center_region_mask (ndarray): The text center region mask. + top_height_map (ndarray): The map on which the distance from points + to top side lines will be drawn for each pixel in text center + regions. + bot_height_map (ndarray): The map on which the distance from points + to bottom side lines will be drawn for each pixel in text + center regions. + sin_map (ndarray): The sin(theta) map where theta is the angle + between vector (top point - bottom point) and vector (1, 0). + cos_map (ndarray): The cos(theta) map where theta is the angle + between vector (top point - bottom point) and vector (1, 0). + + Returns: + pad_comp_attribs (ndarray): The padded text component attributes + of a fixed size. 
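The jitter applied above perturbs component centres, sizes and orientation, then rescales (cos, sin) back onto the unit circle. A minimal standalone sketch of that renormalization step with made-up values (it does not call the transform itself):

```python
import numpy as np

# Toy (cos, sin) pairs after additive jitter; they no longer lie on the unit circle.
cos = np.array([[0.9], [0.1]])
sin = np.array([[0.6], [1.2]])

# Same rescaling as in jitter_comp_attribs: force cos^2 + sin^2 back to 1.
scale = np.sqrt(1.0 / (cos**2 + sin**2 + 1e-8))
cos, sin = cos * scale, sin * scale

print(np.allclose(cos**2 + sin**2, 1.0, atol=1e-3))  # True
```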
+ """ + + assert isinstance(center_lines, list) + assert ( + text_mask.shape == center_region_mask.shape == top_height_map.shape + == bot_height_map.shape == sin_map.shape == cos_map.shape) + + center_lines_mask = np.zeros_like(center_region_mask) + cv2.polylines(center_lines_mask, center_lines, 0, 1, 1) + center_lines_mask = center_lines_mask * center_region_mask + comp_centers = np.argwhere(center_lines_mask > 0) + + y = comp_centers[:, 0] + x = comp_centers[:, 1] + + top_height = top_height_map[y, x].reshape( + (-1, 1)) * self.comp_shrink_ratio + bot_height = bot_height_map[y, x].reshape( + (-1, 1)) * self.comp_shrink_ratio + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + top_mid_points = comp_centers + np.hstack( + [top_height * sin, top_height * cos]) + bot_mid_points = comp_centers - np.hstack( + [bot_height * sin, bot_height * cos]) + + width = (top_height + bot_height) * self.comp_w_h_ratio + width = np.clip(width, self.min_width, self.max_width) + r = width / 2 + + tl = top_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) + tr = top_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) + br = bot_mid_points[:, ::-1] + np.hstack([-r * sin, r * cos]) + bl = bot_mid_points[:, ::-1] - np.hstack([-r * sin, r * cos]) + text_comps = np.hstack([tl, tr, br, bl]).astype(np.float32) + + score = np.ones((text_comps.shape[0], 1), dtype=np.float32) + text_comps = np.hstack([text_comps, score]) + text_comps = la_nms(text_comps, self.text_comp_nms_thr) + + if text_comps.shape[0] >= 1: + img_h, img_w = center_region_mask.shape + text_comps[:, 0:8:2] = np.clip(text_comps[:, 0:8:2], 0, img_w - 1) + text_comps[:, 1:8:2] = np.clip(text_comps[:, 1:8:2], 0, img_h - 1) + + comp_centers = np.mean( + text_comps[:, 0:8].reshape((-1, 4, 2)), axis=1).astype(np.int32) + x = comp_centers[:, 0] + y = comp_centers[:, 1] + + height = (top_height_map[y, x] + bot_height_map[y, x]).reshape( + (-1, 1)) + width = np.clip(height * self.comp_w_h_ratio, self.min_width, + self.max_width) + + cos = cos_map[y, x].reshape((-1, 1)) + sin = sin_map[y, x].reshape((-1, 1)) + + _, comp_label_mask = cv2.connectedComponents( + center_region_mask, connectivity=8) + comp_labels = comp_label_mask[y, x].reshape( + (-1, 1)).astype(np.float32) + + x = x.reshape((-1, 1)).astype(np.float32) + y = y.reshape((-1, 1)).astype(np.float32) + comp_attribs = np.hstack( + [x, y, height, width, cos, sin, comp_labels]) + comp_attribs = self.jitter_comp_attribs(comp_attribs, + self.jitter_level) + + if comp_attribs.shape[0] < self.num_min_comps: + num_rand_comps = self.num_min_comps - comp_attribs.shape[0] + rand_comp_attribs = self.generate_rand_comp_attribs( + num_rand_comps, 1 - text_mask) + comp_attribs = np.vstack([comp_attribs, rand_comp_attribs]) + else: + comp_attribs = self.generate_rand_comp_attribs(self.num_min_comps, + 1 - text_mask) + + num_comps = (np.ones( + (comp_attribs.shape[0], 1), + dtype=np.float32) * comp_attribs.shape[0]) + comp_attribs = np.hstack([num_comps, comp_attribs]) + + if comp_attribs.shape[0] > self.num_max_comps: + comp_attribs = comp_attribs[:self.num_max_comps, :] + comp_attribs[:, 0] = self.num_max_comps + + pad_comp_attribs = np.zeros( + (self.num_max_comps, comp_attribs.shape[1]), dtype=np.float32) + pad_comp_attribs[:comp_attribs.shape[0], :] = comp_attribs + + return pad_comp_attribs + + def generate_text_region_mask(self, img_size, text_polys): + """Generate text center region mask and geometry attribute maps. + + Args: + img_size (tuple): The image size (height, width). 
+ text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + text_region_mask (ndarray): The text region mask. + """ + + assert isinstance(img_size, tuple) + + h, w = img_size + text_region_mask = np.zeros((h, w), dtype=np.uint8) + + for poly in text_polys: + polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2)) + cv2.fillPoly(text_region_mask, polygon, 1) + + return text_region_mask + + def generate_effective_mask(self, mask_size: tuple, polygons_ignore): + """Generate effective mask by setting the ineffective regions to 0 and + effective regions to 1. + + Args: + mask_size (tuple): The mask size. + polygons_ignore (list[[ndarray]]: The list of ignored text + polygons. + + Returns: + mask (ndarray): The effective mask of (height, width). + """ + mask = np.ones(mask_size, dtype=np.uint8) + + for poly in polygons_ignore: + instance = poly.astype(np.int32).reshape(1, -1, 2) + cv2.fillPoly(mask, instance, 0) + + return mask + + def generate_targets(self, data): + """Generate the gt targets for DRRG. + + Args: + data (dict): The input result dictionary. + + Returns: + data (dict): The output result dictionary. + """ + + assert isinstance(data, dict) + + image = data['image'] + polygons = data['polys'] + ignore_tags = data['ignore_tags'] + h, w, _ = image.shape + + polygon_masks = [] + polygon_masks_ignore = [] + for tag, polygon in zip(ignore_tags, polygons): + if tag is True: + polygon_masks_ignore.append(polygon) + else: + polygon_masks.append(polygon) + + gt_text_mask = self.generate_text_region_mask((h, w), polygon_masks) + gt_mask = self.generate_effective_mask((h, w), polygon_masks_ignore) + (center_lines, gt_center_region_mask, gt_top_height_map, + gt_bot_height_map, gt_sin_map, + gt_cos_map) = self.generate_center_mask_attrib_maps((h, w), + polygon_masks) + + gt_comp_attribs = self.generate_comp_attribs( + center_lines, gt_text_mask, gt_center_region_mask, + gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map) + + mapping = { + 'gt_text_mask': gt_text_mask, + 'gt_center_region_mask': gt_center_region_mask, + 'gt_mask': gt_mask, + 'gt_top_height_map': gt_top_height_map, + 'gt_bot_height_map': gt_bot_height_map, + 'gt_sin_map': gt_sin_map, + 'gt_cos_map': gt_cos_map + } + + data.update(mapping) + data['gt_comp_attribs'] = gt_comp_attribs + return data + + def __call__(self, data): + data = self.generate_targets(data) + return data diff --git a/ppocr/data/imaug/east_process.py b/ppocr/data/imaug/east_process.py new file mode 100644 index 0000000000000000000000000000000000000000..df08adfa1516c59229e95af193c172dfcdd5af08 --- /dev/null +++ b/ppocr/data/imaug/east_process.py @@ -0,0 +1,436 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
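The two mask generators above boil down to cv2.fillPoly calls with opposite polarity: kept text polygons are burned in as 1 on a zero canvas, ignored polygons as 0 on a ones canvas. A self-contained sketch with toy polygons:

```python
import cv2
import numpy as np

h, w = 64, 96
kept_poly = np.array([[10, 10], [60, 12], [58, 30], [12, 28]], dtype=np.int32)
ignored_poly = np.array([[70, 40], [90, 40], [90, 55], [70, 55]], dtype=np.int32)

# Text region mask (generate_text_region_mask): 1 inside kept text polygons.
text_mask = np.zeros((h, w), dtype=np.uint8)
cv2.fillPoly(text_mask, [kept_poly], 1)

# Effective mask (generate_effective_mask): 1 everywhere except ignored polygons.
effective_mask = np.ones((h, w), dtype=np.uint8)
cv2.fillPoly(effective_mask, [ignored_poly], 0)

print(int(text_mask.sum()), int((effective_mask == 0).sum()))
```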
+""" +This code is refered from: +https://github.com/songdejia/EAST/blob/master/data_utils.py +""" +import math +import cv2 +import numpy as np +import json +import sys +import os + +__all__ = ['EASTProcessTrain'] + + +class EASTProcessTrain(object): + def __init__(self, + image_shape=[512, 512], + background_ratio=0.125, + min_crop_side_ratio=0.1, + min_text_size=10, + **kwargs): + self.input_size = image_shape[1] + self.random_scale = np.array([0.5, 1, 2.0, 3.0]) + self.background_ratio = background_ratio + self.min_crop_side_ratio = min_crop_side_ratio + self.min_text_size = min_text_size + + def preprocess(self, im): + input_size = self.input_size + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(input_size) / float(im_size_max) + im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale) + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + # im = im[:, :, ::-1].astype(np.float32) + im = im / 255 + im -= img_mean + im /= img_std + new_h, new_w, _ = im.shape + im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32) + im_padded[:new_h, :new_w, :] = im + im_padded = im_padded.transpose((2, 0, 1)) + im_padded = im_padded[np.newaxis, :] + return im_padded, im_scale + + def rotate_im_poly(self, im, text_polys): + """ + rotate image with 90 / 180 / 270 degre + """ + im_w, im_h = im.shape[1], im.shape[0] + dst_im = im.copy() + dst_polys = [] + rand_degree_ratio = np.random.rand() + rand_degree_cnt = 1 + if 0.333 < rand_degree_ratio < 0.666: + rand_degree_cnt = 2 + elif rand_degree_ratio > 0.666: + rand_degree_cnt = 3 + for i in range(rand_degree_cnt): + dst_im = np.rot90(dst_im) + rot_degree = -90 * rand_degree_cnt + rot_angle = rot_degree * math.pi / 180.0 + n_poly = text_polys.shape[0] + cx, cy = 0.5 * im_w, 0.5 * im_h + ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0] + for i in range(n_poly): + wordBB = text_polys[i] + poly = [] + for j in range(4): + sx, sy = wordBB[j][0], wordBB[j][1] + dx = math.cos(rot_angle) * (sx - cx)\ + - math.sin(rot_angle) * (sy - cy) + ncx + dy = math.sin(rot_angle) * (sx - cx)\ + + math.cos(rot_angle) * (sy - cy) + ncy + poly.append([dx, dy]) + dst_polys.append(poly) + dst_polys = np.array(dst_polys, dtype=np.float32) + return dst_im, dst_polys + + def polygon_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. 
+ + def check_and_validate_polys(self, polys, tags, img_height, img_width): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + h, w = img_height, img_width + if polys.shape[0] == 0: + return polys + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + for poly, tag in zip(polys, tags): + p_area = self.polygon_area(poly) + #invalid poly + if abs(p_area) < 1: + continue + if p_area > 0: + #'poly in wrong direction' + if not tag: + tag = True #reversed cases should be ignore + poly = poly[(0, 3, 2, 1), :] + validated_polys.append(poly) + validated_tags.append(tag) + return np.array(validated_polys), np.array(validated_tags) + + def draw_img_polys(self, img, polys): + if len(img.shape) == 4: + img = np.squeeze(img, axis=0) + if img.shape[0] == 3: + img = img.transpose((1, 2, 0)) + img[:, :, 2] += 123.68 + img[:, :, 1] += 116.78 + img[:, :, 0] += 103.94 + cv2.imwrite("tmp.jpg", img) + img = cv2.imread("tmp.jpg") + for box in polys: + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2) + import random + ino = random.randint(0, 100) + cv2.imwrite("tmp_%d.jpg" % ino, img) + return + + def shrink_poly(self, poly, r): + """ + fit a poly inside the origin poly, maybe bugs here... + used for generate the score map + :param poly: the text poly + :param r: r in the paper + :return: the shrinked poly + """ + # shrink ratio + R = 0.3 + # find the longer pair + dist0 = np.linalg.norm(poly[0] - poly[1]) + dist1 = np.linalg.norm(poly[2] - poly[3]) + dist2 = np.linalg.norm(poly[0] - poly[3]) + dist3 = np.linalg.norm(poly[1] - poly[2]) + if dist0 + dist1 > dist2 + dist3: + # first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2) + ## p0, p1 + theta = np.arctan2((poly[1][1] - poly[0][1]), + (poly[1][0] - poly[0][0])) + poly[0][0] += R * r[0] * np.cos(theta) + poly[0][1] += R * r[0] * np.sin(theta) + poly[1][0] -= R * r[1] * np.cos(theta) + poly[1][1] -= R * r[1] * np.sin(theta) + ## p2, p3 + theta = np.arctan2((poly[2][1] - poly[3][1]), + (poly[2][0] - poly[3][0])) + poly[3][0] += R * r[3] * np.cos(theta) + poly[3][1] += R * r[3] * np.sin(theta) + poly[2][0] -= R * r[2] * np.cos(theta) + poly[2][1] -= R * r[2] * np.sin(theta) + ## p0, p3 + theta = np.arctan2((poly[3][0] - poly[0][0]), + (poly[3][1] - poly[0][1])) + poly[0][0] += R * r[0] * np.sin(theta) + poly[0][1] += R * r[0] * np.cos(theta) + poly[3][0] -= R * r[3] * np.sin(theta) + poly[3][1] -= R * r[3] * np.cos(theta) + ## p1, p2 + theta = np.arctan2((poly[2][0] - poly[1][0]), + (poly[2][1] - poly[1][1])) + poly[1][0] += R * r[1] * np.sin(theta) + poly[1][1] += R * r[1] * np.cos(theta) + poly[2][0] -= R * r[2] * np.sin(theta) + poly[2][1] -= R * r[2] * np.cos(theta) + else: + ## p0, p3 + # print poly + theta = np.arctan2((poly[3][0] - poly[0][0]), + (poly[3][1] - poly[0][1])) + poly[0][0] += R * r[0] * np.sin(theta) + poly[0][1] += R * r[0] * np.cos(theta) + poly[3][0] -= R * r[3] * np.sin(theta) + poly[3][1] -= R * r[3] * np.cos(theta) + ## p1, p2 + theta = np.arctan2((poly[2][0] - poly[1][0]), + (poly[2][1] - poly[1][1])) + poly[1][0] += R * r[1] * np.sin(theta) + poly[1][1] += R * r[1] * np.cos(theta) + poly[2][0] -= R * r[2] * np.sin(theta) + poly[2][1] -= R * r[2] * np.cos(theta) + ## p0, p1 + theta = np.arctan2((poly[1][1] - poly[0][1]), + (poly[1][0] - poly[0][0])) + poly[0][0] += R * r[0] 
* np.cos(theta) + poly[0][1] += R * r[0] * np.sin(theta) + poly[1][0] -= R * r[1] * np.cos(theta) + poly[1][1] -= R * r[1] * np.sin(theta) + ## p2, p3 + theta = np.arctan2((poly[2][1] - poly[3][1]), + (poly[2][0] - poly[3][0])) + poly[3][0] += R * r[3] * np.cos(theta) + poly[3][1] += R * r[3] * np.sin(theta) + poly[2][0] -= R * r[2] * np.cos(theta) + poly[2][1] -= R * r[2] * np.sin(theta) + return poly + + def generate_quad(self, im_size, polys, tags): + """ + Generate quadrangle. + """ + h, w = im_size + poly_mask = np.zeros((h, w), dtype=np.uint8) + score_map = np.zeros((h, w), dtype=np.uint8) + # (x1, y1, ..., x4, y4, short_edge_norm) + geo_map = np.zeros((h, w, 9), dtype=np.float32) + # mask used during traning, to ignore some hard areas + training_mask = np.ones((h, w), dtype=np.uint8) + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + r = [None, None, None, None] + for i in range(4): + dist1 = np.linalg.norm(poly[i] - poly[(i + 1) % 4]) + dist2 = np.linalg.norm(poly[i] - poly[(i - 1) % 4]) + r[i] = min(dist1, dist2) + # score map + shrinked_poly = self.shrink_poly( + poly.copy(), r).astype(np.int32)[np.newaxis, :, :] + cv2.fillPoly(score_map, shrinked_poly, 1) + cv2.fillPoly(poly_mask, shrinked_poly, poly_idx + 1) + # if the poly is too small, then ignore it during training + poly_h = min( + np.linalg.norm(poly[0] - poly[3]), + np.linalg.norm(poly[1] - poly[2])) + poly_w = min( + np.linalg.norm(poly[0] - poly[1]), + np.linalg.norm(poly[2] - poly[3])) + if min(poly_h, poly_w) < self.min_text_size: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0) + + if tag: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0) + + xy_in_poly = np.argwhere(poly_mask == (poly_idx + 1)) + # geo map. 
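The geo map filled in the lines just below stores, for every pixel inside the shrunk score region, the offsets from that pixel to the four quad corners plus an inverse short-edge term (nine channels in total). A toy sketch of reading those channels back; the numbers are made up:

```python
import numpy as np

x, y = 120, 45                   # a pixel inside the shrunk text quad
geo = np.array([20., 5., -60., 3., -58., -25., 22., -27., 1. / 30.])  # its 9 geo values

corners = [(x - geo[2 * pno], y - geo[2 * pno + 1]) for pno in range(4)]
print(corners)                   # the four quad corners implied by this pixel
print(1.0 / geo[8])              # the quad's short-edge length, stored as its inverse
```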
+ y_in_poly = xy_in_poly[:, 0] + x_in_poly = xy_in_poly[:, 1] + poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w) + poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h) + for pno in range(4): + geo_channel_beg = pno * 2 + geo_map[y_in_poly, x_in_poly, geo_channel_beg] =\ + x_in_poly - poly[pno, 0] + geo_map[y_in_poly, x_in_poly, geo_channel_beg+1] =\ + y_in_poly - poly[pno, 1] + geo_map[y_in_poly, x_in_poly, 8] = \ + 1.0 / max(min(poly_h, poly_w), 1.0) + return score_map, geo_map, training_mask + + def crop_area(self, im, polys, tags, crop_background=False, max_tries=50): + """ + make random crop from the input image + :param im: + :param polys: + :param tags: + :param crop_background: + :param max_tries: + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags + + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if xmax - xmin < self.min_crop_side_ratio * w or \ + ymax - ymin < self.min_crop_side_ratio * h: + # area too small + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin)\ + & (polys[:, :, 0] <= xmax)\ + & (polys[:, :, 1] >= ymin)\ + & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + + if len(selected_polys) == 0: + # no text in this area + if crop_background: + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = [] + tags = [] + return im, polys, tags + else: + continue + + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags + return im, polys, tags + + def crop_background_infor(self, im, text_polys, text_tags): + im, text_polys, text_tags = self.crop_area( + im, text_polys, text_tags, crop_background=True) + + if len(text_polys) > 0: + return None + # pad and resize image + input_size = self.input_size + im, ratio = self.preprocess(im) + score_map = np.zeros((input_size, input_size), dtype=np.float32) + geo_map = np.zeros((input_size, input_size, 9), dtype=np.float32) + training_mask = np.ones((input_size, input_size), dtype=np.float32) + return im, score_map, geo_map, training_mask + + def crop_foreground_infor(self, im, text_polys, text_tags): + im, text_polys, text_tags = self.crop_area( + im, text_polys, text_tags, crop_background=False) + + if text_polys.shape[0] == 0: + return None + #continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + # pad and resize image + input_size = self.input_size + im, ratio = self.preprocess(im) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + _, _, new_h, new_w = im.shape + # 
print(im.shape) + # self.draw_img_polys(im, text_polys) + score_map, geo_map, training_mask = self.generate_quad( + (new_h, new_w), text_polys, text_tags) + return im, score_map, geo_map, training_mask + + def __call__(self, data): + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + if im is None: + return None + if text_polys.shape[0] == 0: + return None + + #add rotate cases + if np.random.rand() < 0.5: + im, text_polys = self.rotate_im_poly(im, text_polys) + h, w, _ = im.shape + text_polys, text_tags = self.check_and_validate_polys(text_polys, + text_tags, h, w) + if text_polys.shape[0] == 0: + return None + + # random scale this image + rd_scale = np.random.choice(self.random_scale) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + if np.random.rand() < self.background_ratio: + outs = self.crop_background_infor(im, text_polys, text_tags) + else: + outs = self.crop_foreground_infor(im, text_polys, text_tags) + + if outs is None: + return None + im, score_map, geo_map, training_mask = outs + score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32) + geo_map = np.swapaxes(geo_map, 1, 2) + geo_map = np.swapaxes(geo_map, 1, 0) + geo_map = geo_map[:, ::4, ::4].astype(np.float32) + training_mask = training_mask[np.newaxis, ::4, ::4] + training_mask = training_mask.astype(np.float32) + + data['image'] = im[0] + data['score_map'] = score_map + data['geo_map'] = geo_map + data['training_mask'] = training_mask + return data diff --git a/ppocr/data/imaug/fce_aug.py b/ppocr/data/imaug/fce_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..66bafef13caaaa958c89f865bde04cb25f031329 --- /dev/null +++ b/ppocr/data/imaug/fce_aug.py @@ -0,0 +1,564 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/transforms.py +""" +import numpy as np +from PIL import Image, ImageDraw +import cv2 +from shapely.geometry import Polygon +import math +from ppocr.utils.poly_nms import poly_intersection + + +class RandomScaling: + def __init__(self, size=800, scale=(3. / 4, 5. / 2), **kwargs): + """Random scale the image while keeping aspect. + + Args: + size (int) : Base size before scaling. + scale (tuple(float)) : The range of scaling. 
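A minimal usage sketch of the EASTProcessTrain transform defined above, on dummy data. The import path follows the file added in this diff; note the call may return None when the random crop keeps no usable text, and the label maps come out at 1/4 of the padded input resolution:

```python
import numpy as np
from ppocr.data.imaug.east_process import EASTProcessTrain  # path as added in this diff

east = EASTProcessTrain(image_shape=[512, 512], min_text_size=10)

data = {
    'image': np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8),
    'polys': np.array([[[100., 100.], [300., 100.], [300., 160.], [100., 160.]]],
                      dtype=np.float32),           # one quad of 4 (x, y) points
    'ignore_tags': np.array([False]),
}

out = east(data)
if out is not None:
    print(out['image'].shape)                            # (3, 512, 512)
    print(out['score_map'].shape, out['geo_map'].shape)  # (1, 128, 128), (9, 128, 128)
```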
+ """ + assert isinstance(size, int) + assert isinstance(scale, float) or isinstance(scale, tuple) + self.size = size + self.scale = scale if isinstance(scale, tuple) \ + else (1 - scale, 1 + scale) + + def __call__(self, data): + image = data['image'] + text_polys = data['polys'] + h, w, _ = image.shape + + aspect_ratio = np.random.uniform(min(self.scale), max(self.scale)) + scales = self.size * 1.0 / max(h, w) * aspect_ratio + scales = np.array([scales, scales]) + out_size = (int(h * scales[1]), int(w * scales[0])) + image = cv2.resize(image, out_size[::-1]) + + data['image'] = image + text_polys[:, :, 0::2] = text_polys[:, :, 0::2] * scales[1] + text_polys[:, :, 1::2] = text_polys[:, :, 1::2] * scales[0] + data['polys'] = text_polys + + return data + + +class RandomCropFlip: + def __init__(self, + pad_ratio=0.1, + crop_ratio=0.5, + iter_num=1, + min_area_ratio=0.2, + **kwargs): + """Random crop and flip a patch of the image. + + Args: + crop_ratio (float): The ratio of cropping. + iter_num (int): Number of operations. + min_area_ratio (float): Minimal area ratio between cropped patch + and original image. + """ + assert isinstance(crop_ratio, float) + assert isinstance(iter_num, int) + assert isinstance(min_area_ratio, float) + + self.pad_ratio = pad_ratio + self.epsilon = 1e-2 + self.crop_ratio = crop_ratio + self.iter_num = iter_num + self.min_area_ratio = min_area_ratio + + def __call__(self, results): + for i in range(self.iter_num): + results = self.random_crop_flip(results) + + return results + + def random_crop_flip(self, results): + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + if len(polygons) == 0: + return results + + if np.random.random() >= self.crop_ratio: + return results + + h, w, _ = image.shape + area = h * w + pad_h = int(h * self.pad_ratio) + pad_w = int(w * self.pad_ratio) + h_axis, w_axis = self.generate_crop_target(image, polygons, pad_h, + pad_w) + if len(h_axis) == 0 or len(w_axis) == 0: + return results + + attempt = 0 + while attempt < 50: + attempt += 1 + polys_keep = [] + polys_new = [] + ignore_tags_keep = [] + ignore_tags_new = [] + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if (xmax - xmin) * (ymax - ymin) < area * self.min_area_ratio: + # area too small + continue + + pts = np.stack([[xmin, xmax, xmax, xmin], + [ymin, ymin, ymax, ymax]]).T.astype(np.int32) + pp = Polygon(pts) + fail_flag = False + for polygon, ignore_tag in zip(polygons, ignore_tags): + ppi = Polygon(polygon.reshape(-1, 2)) + ppiou, _ = poly_intersection(ppi, pp, buffer=0) + if np.abs(ppiou - float(ppi.area)) > self.epsilon and \ + np.abs(ppiou) > self.epsilon: + fail_flag = True + break + elif np.abs(ppiou - float(ppi.area)) < self.epsilon: + polys_new.append(polygon) + ignore_tags_new.append(ignore_tag) + else: + polys_keep.append(polygon) + ignore_tags_keep.append(ignore_tag) + + if fail_flag: + continue + else: + break + + cropped = image[ymin:ymax, xmin:xmax, :] + select_type = np.random.randint(3) + if select_type == 0: + img = np.ascontiguousarray(cropped[:, ::-1]) + elif select_type == 1: + img = np.ascontiguousarray(cropped[::-1, :]) + else: + img = np.ascontiguousarray(cropped[::-1, ::-1]) + image[ymin:ymax, xmin:xmax, :] = img + results['img'] = image 
+ + if len(polys_new) != 0: + height, width, _ = cropped.shape + if select_type == 0: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 0] = width - poly[:, 0] + 2 * xmin + polys_new[idx] = poly + elif select_type == 1: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 1] = height - poly[:, 1] + 2 * ymin + polys_new[idx] = poly + else: + for idx, polygon in enumerate(polys_new): + poly = polygon.reshape(-1, 2) + poly[:, 0] = width - poly[:, 0] + 2 * xmin + poly[:, 1] = height - poly[:, 1] + 2 * ymin + polys_new[idx] = poly + polygons = polys_keep + polys_new + ignore_tags = ignore_tags_keep + ignore_tags_new + results['polys'] = np.array(polygons) + results['ignore_tags'] = ignore_tags + + return results + + def generate_crop_target(self, image, all_polys, pad_h, pad_w): + """Generate crop target and make sure not to crop the polygon + instances. + + Args: + image (ndarray): The image waited to be crop. + all_polys (list[list[ndarray]]): All polygons including ground + truth polygons and ground truth ignored polygons. + pad_h (int): Padding length of height. + pad_w (int): Padding length of width. + Returns: + h_axis (ndarray): Vertical cropping range. + w_axis (ndarray): Horizontal cropping range. + """ + h, w, _ = image.shape + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + + text_polys = [] + for polygon in all_polys: + rect = cv2.minAreaRect(polygon.astype(np.int32).reshape(-1, 2)) + box = cv2.boxPoints(rect) + box = np.int0(box) + text_polys.append([box[0], box[1], box[2], box[3]]) + + polys = np.array(text_polys, dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + return h_axis, w_axis + + +class RandomCropPolyInstances: + """Randomly crop images and make sure to contain at least one intact + instance.""" + + def __init__(self, crop_ratio=5.0 / 8.0, min_side_ratio=0.4, **kwargs): + super().__init__() + self.crop_ratio = crop_ratio + self.min_side_ratio = min_side_ratio + + def sample_valid_start_end(self, valid_array, min_len, max_start, min_end): + + assert isinstance(min_len, int) + assert len(valid_array) > min_len + + start_array = valid_array.copy() + max_start = min(len(start_array) - min_len, max_start) + start_array[max_start:] = 0 + start_array[0] = 1 + diff_array = np.hstack([0, start_array]) - np.hstack([start_array, 0]) + region_starts = np.where(diff_array < 0)[0] + region_ends = np.where(diff_array > 0)[0] + region_ind = np.random.randint(0, len(region_starts)) + start = np.random.randint(region_starts[region_ind], + region_ends[region_ind]) + + end_array = valid_array.copy() + min_end = max(start + min_len, min_end) + end_array[:min_end] = 0 + end_array[-1] = 1 + diff_array = np.hstack([0, end_array]) - np.hstack([end_array, 0]) + region_starts = np.where(diff_array < 0)[0] + region_ends = np.where(diff_array > 0)[0] + region_ind = np.random.randint(0, len(region_starts)) + end = np.random.randint(region_starts[region_ind], + region_ends[region_ind]) + return start, end + + def sample_crop_box(self, img_size, results): + """Generate crop box and make sure not to crop the polygon instances. 
+ + Args: + img_size (tuple(int)): The image size (h, w). + results (dict): The results dict. + """ + + assert isinstance(img_size, tuple) + h, w = img_size[:2] + + key_masks = results['polys'] + + x_valid_array = np.ones(w, dtype=np.int32) + y_valid_array = np.ones(h, dtype=np.int32) + + selected_mask = key_masks[np.random.randint(0, len(key_masks))] + selected_mask = selected_mask.reshape((-1, 2)).astype(np.int32) + max_x_start = max(np.min(selected_mask[:, 0]) - 2, 0) + min_x_end = min(np.max(selected_mask[:, 0]) + 3, w - 1) + max_y_start = max(np.min(selected_mask[:, 1]) - 2, 0) + min_y_end = min(np.max(selected_mask[:, 1]) + 3, h - 1) + + for mask in key_masks: + mask = mask.reshape((-1, 2)).astype(np.int32) + clip_x = np.clip(mask[:, 0], 0, w - 1) + clip_y = np.clip(mask[:, 1], 0, h - 1) + min_x, max_x = np.min(clip_x), np.max(clip_x) + min_y, max_y = np.min(clip_y), np.max(clip_y) + + x_valid_array[min_x - 2:max_x + 3] = 0 + y_valid_array[min_y - 2:max_y + 3] = 0 + + min_w = int(w * self.min_side_ratio) + min_h = int(h * self.min_side_ratio) + + x1, x2 = self.sample_valid_start_end(x_valid_array, min_w, max_x_start, + min_x_end) + y1, y2 = self.sample_valid_start_end(y_valid_array, min_h, max_y_start, + min_y_end) + + return np.array([x1, y1, x2, y2]) + + def crop_img(self, img, bbox): + assert img.ndim == 3 + h, w, _ = img.shape + assert 0 <= bbox[1] < bbox[3] <= h + assert 0 <= bbox[0] < bbox[2] <= w + return img[bbox[1]:bbox[3], bbox[0]:bbox[2]] + + def __call__(self, results): + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + if len(polygons) < 1: + return results + + if np.random.random_sample() < self.crop_ratio: + + crop_box = self.sample_crop_box(image.shape, results) + img = self.crop_img(image, crop_box) + results['image'] = img + # crop and filter masks + x1, y1, x2, y2 = crop_box + w = max(x2 - x1, 1) + h = max(y2 - y1, 1) + polygons[:, :, 0::2] = polygons[:, :, 0::2] - x1 + polygons[:, :, 1::2] = polygons[:, :, 1::2] - y1 + + valid_masks_list = [] + valid_tags_list = [] + for ind, polygon in enumerate(polygons): + if (polygon[:, ::2] > -4).all() and ( + polygon[:, ::2] < w + 4).all() and ( + polygon[:, 1::2] > -4).all() and ( + polygon[:, 1::2] < h + 4).all(): + polygon[:, ::2] = np.clip(polygon[:, ::2], 0, w) + polygon[:, 1::2] = np.clip(polygon[:, 1::2], 0, h) + valid_masks_list.append(polygon) + valid_tags_list.append(ignore_tags[ind]) + + results['polys'] = np.array(valid_masks_list) + results['ignore_tags'] = valid_tags_list + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str + + +class RandomRotatePolyInstances: + def __init__(self, + rotate_ratio=0.5, + max_angle=10, + pad_with_fixed_color=False, + pad_value=(0, 0, 0), + **kwargs): + """Randomly rotate images and polygon masks. + + Args: + rotate_ratio (float): The ratio of samples to operate rotation. + max_angle (int): The maximum rotation angle. + pad_with_fixed_color (bool): The flag for whether to pad rotated + image with fixed value. If set to False, the rotated image will + be padded onto cropped image. + pad_value (tuple(int)): The color value for padding rotated image. + """ + self.rotate_ratio = rotate_ratio + self.max_angle = max_angle + self.pad_with_fixed_color = pad_with_fixed_color + self.pad_value = pad_value + + def rotate(self, center, points, theta, center_shift=(0, 0)): + # rotate points. 
+ (center_x, center_y) = center + center_y = -center_y + x, y = points[:, ::2], points[:, 1::2] + y = -y + + theta = theta / 180 * math.pi + cos = math.cos(theta) + sin = math.sin(theta) + + x = (x - center_x) + y = (y - center_y) + + _x = center_x + x * cos - y * sin + center_shift[0] + _y = -(center_y + x * sin + y * cos) + center_shift[1] + + points[:, ::2], points[:, 1::2] = _x, _y + return points + + def cal_canvas_size(self, ori_size, degree): + assert isinstance(ori_size, tuple) + angle = degree * math.pi / 180.0 + h, w = ori_size[:2] + + cos = math.cos(angle) + sin = math.sin(angle) + canvas_h = int(w * math.fabs(sin) + h * math.fabs(cos)) + canvas_w = int(w * math.fabs(cos) + h * math.fabs(sin)) + + canvas_size = (canvas_h, canvas_w) + return canvas_size + + def sample_angle(self, max_angle): + angle = np.random.random_sample() * 2 * max_angle - max_angle + return angle + + def rotate_img(self, img, angle, canvas_size): + h, w = img.shape[:2] + rotation_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1) + rotation_matrix[0, 2] += int((canvas_size[1] - w) / 2) + rotation_matrix[1, 2] += int((canvas_size[0] - h) / 2) + + if self.pad_with_fixed_color: + target_img = cv2.warpAffine( + img, + rotation_matrix, (canvas_size[1], canvas_size[0]), + flags=cv2.INTER_NEAREST, + borderValue=self.pad_value) + else: + mask = np.zeros_like(img) + (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8), + np.random.randint(0, w * 7 // 8)) + img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)] + img_cut = cv2.resize(img_cut, (canvas_size[1], canvas_size[0])) + + mask = cv2.warpAffine( + mask, + rotation_matrix, (canvas_size[1], canvas_size[0]), + borderValue=[1, 1, 1]) + target_img = cv2.warpAffine( + img, + rotation_matrix, (canvas_size[1], canvas_size[0]), + borderValue=[0, 0, 0]) + target_img = target_img + img_cut * mask + + return target_img + + def __call__(self, results): + if np.random.random_sample() < self.rotate_ratio: + image = results['image'] + polygons = results['polys'] + h, w = image.shape[:2] + + angle = self.sample_angle(self.max_angle) + canvas_size = self.cal_canvas_size((h, w), angle) + center_shift = (int((canvas_size[1] - w) / 2), int( + (canvas_size[0] - h) / 2)) + image = self.rotate_img(image, angle, canvas_size) + results['image'] = image + # rotate polygons + rotated_masks = [] + for mask in polygons: + rotated_mask = self.rotate((w / 2, h / 2), mask, angle, + center_shift) + rotated_masks.append(rotated_mask) + results['polys'] = np.array(rotated_masks) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str + + +class SquareResizePad: + def __init__(self, + target_size, + pad_ratio=0.6, + pad_with_fixed_color=False, + pad_value=(0, 0, 0), + **kwargs): + """Resize or pad images to be square shape. + + Args: + target_size (int): The target size of square shaped image. + pad_with_fixed_color (bool): The flag for whether to pad rotated + image with fixed value. If set to False, the rescales image will + be padded onto cropped image. + pad_value (tuple(int)): The color value for padding rotated image. 
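rotate_img above enlarges the output canvas so the rotated image is not clipped, then shifts the rotation matrix to recentre the content. A standalone sketch of that canvas math with plain black padding (the transform can instead pad with a resized crop of the image itself):

```python
import math
import cv2
import numpy as np

h, w, angle = 100, 200, 15.0
rad = math.radians(angle)
# Same canvas formula as cal_canvas_size: big enough for the rotated rectangle.
canvas_h = int(w * abs(math.sin(rad)) + h * abs(math.cos(rad)))
canvas_w = int(w * abs(math.cos(rad)) + h * abs(math.sin(rad)))

img = np.full((h, w, 3), 255, dtype=np.uint8)
rotation_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
rotation_matrix[0, 2] += int((canvas_w - w) / 2)   # recentre on the larger canvas
rotation_matrix[1, 2] += int((canvas_h - h) / 2)
rotated = cv2.warpAffine(img, rotation_matrix, (canvas_w, canvas_h),
                         borderValue=(0, 0, 0))
print(img.shape, '->', rotated.shape)
```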
+ """ + assert isinstance(target_size, int) + assert isinstance(pad_ratio, float) + assert isinstance(pad_with_fixed_color, bool) + assert isinstance(pad_value, tuple) + + self.target_size = target_size + self.pad_ratio = pad_ratio + self.pad_with_fixed_color = pad_with_fixed_color + self.pad_value = pad_value + + def resize_img(self, img, keep_ratio=True): + h, w, _ = img.shape + if keep_ratio: + t_h = self.target_size if h >= w else int(h * self.target_size / w) + t_w = self.target_size if h <= w else int(w * self.target_size / h) + else: + t_h = t_w = self.target_size + img = cv2.resize(img, (t_w, t_h)) + return img, (t_h, t_w) + + def square_pad(self, img): + h, w = img.shape[:2] + if h == w: + return img, (0, 0) + pad_size = max(h, w) + if self.pad_with_fixed_color: + expand_img = np.ones((pad_size, pad_size, 3), dtype=np.uint8) + expand_img[:] = self.pad_value + else: + (h_ind, w_ind) = (np.random.randint(0, h * 7 // 8), + np.random.randint(0, w * 7 // 8)) + img_cut = img[h_ind:(h_ind + h // 9), w_ind:(w_ind + w // 9)] + expand_img = cv2.resize(img_cut, (pad_size, pad_size)) + if h > w: + y0, x0 = 0, (h - w) // 2 + else: + y0, x0 = (w - h) // 2, 0 + expand_img[y0:y0 + h, x0:x0 + w] = img + offset = (x0, y0) + + return expand_img, offset + + def square_pad_mask(self, points, offset): + x0, y0 = offset + pad_points = points.copy() + pad_points[::2] = pad_points[::2] + x0 + pad_points[1::2] = pad_points[1::2] + y0 + return pad_points + + def __call__(self, results): + image = results['image'] + polygons = results['polys'] + h, w = image.shape[:2] + + if np.random.random_sample() < self.pad_ratio: + image, out_size = self.resize_img(image, keep_ratio=True) + image, offset = self.square_pad(image) + else: + image, out_size = self.resize_img(image, keep_ratio=False) + offset = (0, 0) + results['image'] = image + try: + polygons[:, :, 0::2] = polygons[:, :, 0::2] * out_size[ + 1] / w + offset[0] + polygons[:, :, 1::2] = polygons[:, :, 1::2] * out_size[ + 0] / h + offset[1] + except: + pass + results['polys'] = polygons + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + return repr_str diff --git a/ppocr/data/imaug/fce_targets.py b/ppocr/data/imaug/fce_targets.py new file mode 100644 index 0000000000000000000000000000000000000000..8c64276e26665d2779d35154bf9cd77edddad580 --- /dev/null +++ b/ppocr/data/imaug/fce_targets.py @@ -0,0 +1,666 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/fcenet_targets.py +""" + +import cv2 +import numpy as np +from numpy.fft import fft +from numpy.linalg import norm +import sys + +def vector_slope(vec): + assert len(vec) == 2 + return abs(vec[1] / (vec[0] + 1e-8)) + +class FCENetTargets: + """Generate the ground truth targets of FCENet: Fourier Contour Embedding + for Arbitrary-Shaped Text Detection. 
+ + [https://arxiv.org/abs/2104.10442] + + Args: + fourier_degree (int): The maximum Fourier transform degree k. + resample_step (float): The step size for resampling the text center + line (TCL). It's better not to exceed half of the minimum width. + center_region_shrink_ratio (float): The shrink ratio of text center + region. + level_size_divisors (tuple(int)): The downsample ratio on each level. + level_proportion_range (tuple(tuple(int))): The range of text sizes + assigned to each level. + """ + + def __init__(self, + fourier_degree=5, + resample_step=4.0, + center_region_shrink_ratio=0.3, + level_size_divisors=(8, 16, 32), + level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0)), + orientation_thr=2.0, + **kwargs): + + super().__init__() + assert isinstance(level_size_divisors, tuple) + assert isinstance(level_proportion_range, tuple) + assert len(level_size_divisors) == len(level_proportion_range) + self.fourier_degree = fourier_degree + self.resample_step = resample_step + self.center_region_shrink_ratio = center_region_shrink_ratio + self.level_size_divisors = level_size_divisors + self.level_proportion_range = level_proportion_range + + self.orientation_thr = orientation_thr + + def vector_angle(self, vec1, vec2): + if vec1.ndim > 1: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8).reshape((-1, 1)) + else: + unit_vec1 = vec1 / (norm(vec1, axis=-1) + 1e-8) + if vec2.ndim > 1: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8).reshape((-1, 1)) + else: + unit_vec2 = vec2 / (norm(vec2, axis=-1) + 1e-8) + return np.arccos( + np.clip( + np.sum(unit_vec1 * unit_vec2, axis=-1), -1.0, 1.0)) + + def resample_line(self, line, n): + """Resample n points on a line. + + Args: + line (ndarray): The points composing a line. + n (int): The resampled points number. + + Returns: + resampled_line (ndarray): The points composing the resampled line. + """ + + assert line.ndim == 2 + assert line.shape[0] >= 2 + assert line.shape[1] == 2 + assert isinstance(n, int) + assert n > 0 + + length_list = [ + norm(line[i + 1] - line[i]) for i in range(len(line) - 1) + ] + total_length = sum(length_list) + length_cumsum = np.cumsum([0.0] + length_list) + delta_length = total_length / (float(n) + 1e-8) + + current_edge_ind = 0 + resampled_line = [line[0]] + + for i in range(1, n): + current_line_len = i * delta_length + + while current_edge_ind + 1 < len(length_cumsum) and current_line_len >= length_cumsum[current_edge_ind + 1]: + current_edge_ind += 1 + + current_edge_end_shift = current_line_len - length_cumsum[ + current_edge_ind] + + if current_edge_ind >= len(length_list): + break + end_shift_ratio = current_edge_end_shift / length_list[ + current_edge_ind] + current_point = line[current_edge_ind] + (line[current_edge_ind + 1] + - line[current_edge_ind] + ) * end_shift_ratio + resampled_line.append(current_point) + resampled_line.append(line[-1]) + resampled_line = np.array(resampled_line) + + return resampled_line + + def reorder_poly_edge(self, points): + """Get the respective points composing head edge, tail edge, top + sideline and bottom sideline. + + Args: + points (ndarray): The points composing a text polygon. + + Returns: + head_edge (ndarray): The two points composing the head edge of text + polygon. + tail_edge (ndarray): The two points composing the tail edge of text + polygon. + top_sideline (ndarray): The points composing top curved sideline of + text polygon. + bot_sideline (ndarray): The points composing bottom curved sideline + of text polygon. 
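resample_line above walks the cumulative arc length of a polyline and places new points at an even spacing. A quick sketch with the transform's defaults; the import path follows this diff's layout:

```python
import numpy as np
from ppocr.data.imaug.fce_targets import FCENetTargets  # path as added in this diff

targets = FCENetTargets()              # defaults: fourier_degree=5, resample_step=4.0

line = np.array([[0., 0.], [10., 0.], [10., 5.], [20., 5.]])   # a 3-segment polyline
resampled = targets.resample_line(line, 8)
print(resampled.shape)   # (9, 2): the start point, 7 arc-length-spaced points, the end point
print(resampled[:3])
```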
+ """ + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + + head_inds, tail_inds = self.find_head_tail(points, self.orientation_thr) + head_edge, tail_edge = points[head_inds], points[tail_inds] + + pad_points = np.vstack([points, points]) + if tail_inds[1] < 1: + tail_inds[1] = len(points) + sideline1 = pad_points[head_inds[1]:tail_inds[1]] + sideline2 = pad_points[tail_inds[1]:(head_inds[1] + len(points))] + sideline_mean_shift = np.mean( + sideline1, axis=0) - np.mean( + sideline2, axis=0) + + if sideline_mean_shift[1] > 0: + top_sideline, bot_sideline = sideline2, sideline1 + else: + top_sideline, bot_sideline = sideline1, sideline2 + + return head_edge, tail_edge, top_sideline, bot_sideline + + def find_head_tail(self, points, orientation_thr): + """Find the head edge and tail edge of a text polygon. + + Args: + points (ndarray): The points composing a text polygon. + orientation_thr (float): The threshold for distinguishing between + head edge and tail edge among the horizontal and vertical edges + of a quadrangle. + + Returns: + head_inds (list): The indexes of two points composing head edge. + tail_inds (list): The indexes of two points composing tail edge. + """ + + assert points.ndim == 2 + assert points.shape[0] >= 4 + assert points.shape[1] == 2 + assert isinstance(orientation_thr, float) + + if len(points) > 4: + pad_points = np.vstack([points, points[0]]) + edge_vec = pad_points[1:] - pad_points[:-1] + + theta_sum = [] + adjacent_vec_theta = [] + for i, edge_vec1 in enumerate(edge_vec): + adjacent_ind = [x % len(edge_vec) for x in [i - 1, i + 1]] + adjacent_edge_vec = edge_vec[adjacent_ind] + temp_theta_sum = np.sum( + self.vector_angle(edge_vec1, adjacent_edge_vec)) + temp_adjacent_theta = self.vector_angle(adjacent_edge_vec[0], + adjacent_edge_vec[1]) + theta_sum.append(temp_theta_sum) + adjacent_vec_theta.append(temp_adjacent_theta) + theta_sum_score = np.array(theta_sum) / np.pi + adjacent_theta_score = np.array(adjacent_vec_theta) / np.pi + poly_center = np.mean(points, axis=0) + edge_dist = np.maximum( + norm( + pad_points[1:] - poly_center, axis=-1), + norm( + pad_points[:-1] - poly_center, axis=-1)) + dist_score = edge_dist / np.max(edge_dist) + position_score = np.zeros(len(edge_vec)) + score = 0.5 * theta_sum_score + 0.15 * adjacent_theta_score + score += 0.35 * dist_score + if len(points) % 2 == 0: + position_score[(len(score) // 2 - 1)] += 1 + position_score[-1] += 1 + score += 0.1 * position_score + pad_score = np.concatenate([score, score]) + score_matrix = np.zeros((len(score), len(score) - 3)) + x = np.arange(len(score) - 3) / float(len(score) - 4) + gaussian = 1. / (np.sqrt(2. * np.pi) * 0.5) * np.exp(-np.power( + (x - 0.5) / 0.5, 2.) 
/ 2) + gaussian = gaussian / np.max(gaussian) + for i in range(len(score)): + score_matrix[i, :] = score[i] + pad_score[(i + 2):(i + len( + score) - 1)] * gaussian * 0.3 + + head_start, tail_increment = np.unravel_index(score_matrix.argmax(), + score_matrix.shape) + tail_start = (head_start + tail_increment + 2) % len(points) + head_end = (head_start + 1) % len(points) + tail_end = (tail_start + 1) % len(points) + + if head_end > tail_end: + head_start, tail_start = tail_start, head_start + head_end, tail_end = tail_end, head_end + head_inds = [head_start, head_end] + tail_inds = [tail_start, tail_end] + else: + if vector_slope(points[1] - points[0]) + vector_slope( + points[3] - points[2]) < vector_slope(points[ + 2] - points[1]) + vector_slope(points[0] - points[ + 3]): + horizontal_edge_inds = [[0, 1], [2, 3]] + vertical_edge_inds = [[3, 0], [1, 2]] + else: + horizontal_edge_inds = [[3, 0], [1, 2]] + vertical_edge_inds = [[0, 1], [2, 3]] + + vertical_len_sum = norm(points[vertical_edge_inds[0][0]] - points[ + vertical_edge_inds[0][1]]) + norm(points[vertical_edge_inds[1][ + 0]] - points[vertical_edge_inds[1][1]]) + horizontal_len_sum = norm(points[horizontal_edge_inds[0][ + 0]] - points[horizontal_edge_inds[0][1]]) + norm(points[ + horizontal_edge_inds[1][0]] - points[horizontal_edge_inds[1] + [1]]) + + if vertical_len_sum > horizontal_len_sum * orientation_thr: + head_inds = horizontal_edge_inds[0] + tail_inds = horizontal_edge_inds[1] + else: + head_inds = vertical_edge_inds[0] + tail_inds = vertical_edge_inds[1] + + return head_inds, tail_inds + + def resample_sidelines(self, sideline1, sideline2, resample_step): + """Resample two sidelines to be of the same points number according to + step size. + + Args: + sideline1 (ndarray): The points composing a sideline of a text + polygon. + sideline2 (ndarray): The points composing another sideline of a + text polygon. + resample_step (float): The resampled step size. + + Returns: + resampled_line1 (ndarray): The resampled line 1. + resampled_line2 (ndarray): The resampled line 2. + """ + + assert sideline1.ndim == sideline2.ndim == 2 + assert sideline1.shape[1] == sideline2.shape[1] == 2 + assert sideline1.shape[0] >= 2 + assert sideline2.shape[0] >= 2 + assert isinstance(resample_step, float) + + length1 = sum([ + norm(sideline1[i + 1] - sideline1[i]) + for i in range(len(sideline1) - 1) + ]) + length2 = sum([ + norm(sideline2[i + 1] - sideline2[i]) + for i in range(len(sideline2) - 1) + ]) + + total_length = (length1 + length2) / 2 + resample_point_num = max(int(float(total_length) / resample_step), 1) + + resampled_line1 = self.resample_line(sideline1, resample_point_num) + resampled_line2 = self.resample_line(sideline2, resample_point_num) + + return resampled_line1, resampled_line2 + + def generate_center_region_mask(self, img_size, text_polys): + """Generate text center region mask. + + Args: + img_size (tuple): The image size of (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + center_region_mask (ndarray): The text center region mask. 
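resample_sidelines above brings the two sidelines of a text polygon to the same point count so they can be averaged into a centre line (the bottom line is reversed first because it is traversed in the opposite direction along the polygon boundary). A small runnable sketch:

```python
import numpy as np
from ppocr.data.imaug.fce_targets import FCENetTargets  # path as added in this diff

targets = FCENetTargets(resample_step=4.0)

top = np.array([[0., 0.], [40., 2.], [80., 0.]])      # left -> right
bot = np.array([[80., 20.], [40., 22.], [0., 20.]])   # right -> left, as on the boundary

top_r, bot_r = targets.resample_sidelines(top, bot, targets.resample_step)
print(top_r.shape, bot_r.shape)                       # same point count for both

center_line = (top_r + bot_r[::-1]) / 2  # same trick as generate_center_region_mask
print(center_line[0], center_line[-1])
```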
+ """ + + assert isinstance(img_size, tuple) + # assert check_argument.is_2dlist(text_polys) + + h, w = img_size + + center_region_mask = np.zeros((h, w), np.uint8) + + center_region_boxes = [] + for poly in text_polys: + # assert len(poly) == 1 + polygon_points = poly.reshape(-1, 2) + _, _, top_line, bot_line = self.reorder_poly_edge(polygon_points) + resampled_top_line, resampled_bot_line = self.resample_sidelines( + top_line, bot_line, self.resample_step) + resampled_bot_line = resampled_bot_line[::-1] + if len(resampled_top_line) != len(resampled_bot_line): + continue + center_line = (resampled_top_line + resampled_bot_line) / 2 + + line_head_shrink_len = norm(resampled_top_line[0] - + resampled_bot_line[0]) / 4.0 + line_tail_shrink_len = norm(resampled_top_line[-1] - + resampled_bot_line[-1]) / 4.0 + head_shrink_num = int(line_head_shrink_len // self.resample_step) + tail_shrink_num = int(line_tail_shrink_len // self.resample_step) + if len(center_line) > head_shrink_num + tail_shrink_num + 2: + center_line = center_line[head_shrink_num:len(center_line) - + tail_shrink_num] + resampled_top_line = resampled_top_line[head_shrink_num:len( + resampled_top_line) - tail_shrink_num] + resampled_bot_line = resampled_bot_line[head_shrink_num:len( + resampled_bot_line) - tail_shrink_num] + + for i in range(0, len(center_line) - 1): + tl = center_line[i] + (resampled_top_line[i] - center_line[i] + ) * self.center_region_shrink_ratio + tr = center_line[i + 1] + (resampled_top_line[i + 1] - + center_line[i + 1] + ) * self.center_region_shrink_ratio + br = center_line[i + 1] + (resampled_bot_line[i + 1] - + center_line[i + 1] + ) * self.center_region_shrink_ratio + bl = center_line[i] + (resampled_bot_line[i] - center_line[i] + ) * self.center_region_shrink_ratio + current_center_box = np.vstack([tl, tr, br, + bl]).astype(np.int32) + center_region_boxes.append(current_center_box) + + cv2.fillPoly(center_region_mask, center_region_boxes, 1) + return center_region_mask + + def resample_polygon(self, polygon, n=400): + """Resample one polygon with n points on its boundary. + + Args: + polygon (list[float]): The input polygon. + n (int): The number of resampled points. + Returns: + resampled_polygon (list[float]): The resampled polygon. + """ + length = [] + + for i in range(len(polygon)): + p1 = polygon[i] + if i == len(polygon) - 1: + p2 = polygon[0] + else: + p2 = polygon[i + 1] + length.append(((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)**0.5) + + total_length = sum(length) + n_on_each_line = (np.array(length) / (total_length + 1e-8)) * n + n_on_each_line = n_on_each_line.astype(np.int32) + new_polygon = [] + + for i in range(len(polygon)): + num = n_on_each_line[i] + p1 = polygon[i] + if i == len(polygon) - 1: + p2 = polygon[0] + else: + p2 = polygon[i + 1] + + if num == 0: + continue + + dxdy = (p2 - p1) / num + for j in range(num): + point = p1 + dxdy * j + new_polygon.append(point) + + return np.array(new_polygon) + + def normalize_polygon(self, polygon): + """Normalize one polygon so that its start point is at right most. + + Args: + polygon (list[float]): The origin polygon. + Returns: + new_polygon (lost[float]): The polygon with start point at right. 
+ """ + temp_polygon = polygon - polygon.mean(axis=0) + x = np.abs(temp_polygon[:, 0]) + y = temp_polygon[:, 1] + index_x = np.argsort(x) + index_y = np.argmin(y[index_x[:8]]) + index = index_x[index_y] + new_polygon = np.concatenate([polygon[index:], polygon[:index]]) + return new_polygon + + def poly2fourier(self, polygon, fourier_degree): + """Perform Fourier transformation to generate Fourier coefficients ck + from polygon. + + Args: + polygon (ndarray): An input polygon. + fourier_degree (int): The maximum Fourier degree K. + Returns: + c (ndarray(complex)): Fourier coefficients. + """ + points = polygon[:, 0] + polygon[:, 1] * 1j + c_fft = fft(points) / len(points) + c = np.hstack((c_fft[-fourier_degree:], c_fft[:fourier_degree + 1])) + return c + + def clockwise(self, c, fourier_degree): + """Make sure the polygon reconstructed from Fourier coefficients c in + the clockwise direction. + + Args: + polygon (list[float]): The origin polygon. + Returns: + new_polygon (lost[float]): The polygon in clockwise point order. + """ + if np.abs(c[fourier_degree + 1]) > np.abs(c[fourier_degree - 1]): + return c + elif np.abs(c[fourier_degree + 1]) < np.abs(c[fourier_degree - 1]): + return c[::-1] + else: + if np.abs(c[fourier_degree + 2]) > np.abs(c[fourier_degree - 2]): + return c + else: + return c[::-1] + + def cal_fourier_signature(self, polygon, fourier_degree): + """Calculate Fourier signature from input polygon. + + Args: + polygon (ndarray): The input polygon. + fourier_degree (int): The maximum Fourier degree K. + Returns: + fourier_signature (ndarray): An array shaped (2k+1, 2) containing + real part and image part of 2k+1 Fourier coefficients. + """ + resampled_polygon = self.resample_polygon(polygon) + resampled_polygon = self.normalize_polygon(resampled_polygon) + + fourier_coeff = self.poly2fourier(resampled_polygon, fourier_degree) + fourier_coeff = self.clockwise(fourier_coeff, fourier_degree) + + real_part = np.real(fourier_coeff).reshape((-1, 1)) + image_part = np.imag(fourier_coeff).reshape((-1, 1)) + fourier_signature = np.hstack([real_part, image_part]) + + return fourier_signature + + def generate_fourier_maps(self, img_size, text_polys): + """Generate Fourier coefficient maps. + + Args: + img_size (tuple): The image size of (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + fourier_real_map (ndarray): The Fourier coefficient real part maps. + fourier_image_map (ndarray): The Fourier coefficient image part + maps. 
+ """ + + assert isinstance(img_size, tuple) + + h, w = img_size + k = self.fourier_degree + real_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32) + imag_map = np.zeros((k * 2 + 1, h, w), dtype=np.float32) + + for poly in text_polys: + mask = np.zeros((h, w), dtype=np.uint8) + polygon = np.array(poly).reshape((1, -1, 2)) + cv2.fillPoly(mask, polygon.astype(np.int32), 1) + fourier_coeff = self.cal_fourier_signature(polygon[0], k) + for i in range(-k, k + 1): + if i != 0: + real_map[i + k, :, :] = mask * fourier_coeff[i + k, 0] + ( + 1 - mask) * real_map[i + k, :, :] + imag_map[i + k, :, :] = mask * fourier_coeff[i + k, 1] + ( + 1 - mask) * imag_map[i + k, :, :] + else: + yx = np.argwhere(mask > 0.5) + k_ind = np.ones((len(yx)), dtype=np.int64) * k + y, x = yx[:, 0], yx[:, 1] + real_map[k_ind, y, x] = fourier_coeff[k, 0] - x + imag_map[k_ind, y, x] = fourier_coeff[k, 1] - y + + return real_map, imag_map + + def generate_text_region_mask(self, img_size, text_polys): + """Generate text center region mask and geometry attribute maps. + + Args: + img_size (tuple): The image size (height, width). + text_polys (list[list[ndarray]]): The list of text polygons. + + Returns: + text_region_mask (ndarray): The text region mask. + """ + + assert isinstance(img_size, tuple) + + h, w = img_size + text_region_mask = np.zeros((h, w), dtype=np.uint8) + + for poly in text_polys: + polygon = np.array(poly, dtype=np.int32).reshape((1, -1, 2)) + cv2.fillPoly(text_region_mask, polygon, 1) + + return text_region_mask + + def generate_effective_mask(self, mask_size: tuple, polygons_ignore): + """Generate effective mask by setting the ineffective regions to 0 and + effective regions to 1. + + Args: + mask_size (tuple): The mask size. + polygons_ignore (list[[ndarray]]: The list of ignored text + polygons. + + Returns: + mask (ndarray): The effective mask of (height, width). + """ + + mask = np.ones(mask_size, dtype=np.uint8) + + for poly in polygons_ignore: + instance = poly.reshape(-1, 2).astype(np.int32).reshape(1, -1, 2) + cv2.fillPoly(mask, instance, 0) + + return mask + + def generate_level_targets(self, img_size, text_polys, ignore_polys): + """Generate ground truth target on each level. + + Args: + img_size (list[int]): Shape of input image. + text_polys (list[list[ndarray]]): A list of ground truth polygons. + ignore_polys (list[list[ndarray]]): A list of ignored polygons. + Returns: + level_maps (list(ndarray)): A list of ground target on each level. 
+ """ + h, w = img_size + lv_size_divs = self.level_size_divisors + lv_proportion_range = self.level_proportion_range + lv_text_polys = [[] for i in range(len(lv_size_divs))] + lv_ignore_polys = [[] for i in range(len(lv_size_divs))] + level_maps = [] + for poly in text_polys: + polygon = np.array(poly, dtype=np.int).reshape((1, -1, 2)) + _, _, box_w, box_h = cv2.boundingRect(polygon) + proportion = max(box_h, box_w) / (h + 1e-8) + + for ind, proportion_range in enumerate(lv_proportion_range): + if proportion_range[0] < proportion < proportion_range[1]: + lv_text_polys[ind].append(poly / lv_size_divs[ind]) + + for ignore_poly in ignore_polys: + polygon = np.array(ignore_poly, dtype=np.int).reshape((1, -1, 2)) + _, _, box_w, box_h = cv2.boundingRect(polygon) + proportion = max(box_h, box_w) / (h + 1e-8) + + for ind, proportion_range in enumerate(lv_proportion_range): + if proportion_range[0] < proportion < proportion_range[1]: + lv_ignore_polys[ind].append(ignore_poly / lv_size_divs[ind]) + + for ind, size_divisor in enumerate(lv_size_divs): + current_level_maps = [] + level_img_size = (h // size_divisor, w // size_divisor) + + text_region = self.generate_text_region_mask( + level_img_size, lv_text_polys[ind])[None] + current_level_maps.append(text_region) + + center_region = self.generate_center_region_mask( + level_img_size, lv_text_polys[ind])[None] + current_level_maps.append(center_region) + + effective_mask = self.generate_effective_mask( + level_img_size, lv_ignore_polys[ind])[None] + current_level_maps.append(effective_mask) + + fourier_real_map, fourier_image_maps = self.generate_fourier_maps( + level_img_size, lv_text_polys[ind]) + current_level_maps.append(fourier_real_map) + current_level_maps.append(fourier_image_maps) + + level_maps.append(np.concatenate(current_level_maps)) + + return level_maps + + def generate_targets(self, results): + """Generate the ground truth targets for FCENet. + + Args: + results (dict): The input result dictionary. + + Returns: + results (dict): The output result dictionary. + """ + + assert isinstance(results, dict) + image = results['image'] + polygons = results['polys'] + ignore_tags = results['ignore_tags'] + h, w, _ = image.shape + + polygon_masks = [] + polygon_masks_ignore = [] + for tag, polygon in zip(ignore_tags, polygons): + if tag is True: + polygon_masks_ignore.append(polygon) + else: + polygon_masks.append(polygon) + + level_maps = self.generate_level_targets((h, w), polygon_masks, + polygon_masks_ignore) + + mapping = { + 'p3_maps': level_maps[0], + 'p4_maps': level_maps[1], + 'p5_maps': level_maps[2] + } + for key, value in mapping.items(): + results[key] = value + + return results + + def __call__(self, results): + results = self.generate_targets(results) + return results diff --git a/ppocr/data/imaug/iaa_augment.py b/ppocr/data/imaug/iaa_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..0aac7877c257f3e7532ca2806891775913d416b7 --- /dev/null +++ b/ppocr/data/imaug/iaa_augment.py @@ -0,0 +1,105 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
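To make the Fourier-signature construction above concrete, here is a stripped-down sketch of what the poly2fourier / cal_fourier_signature steps compute, omitting the resampling, normalization and clockwise-ordering stages; fourier_signature is an illustrative helper written for this note, not a function from the file above.

import numpy as np
from numpy.fft import fft

def fourier_signature(polygon, fourier_degree):
    # Polygon vertices become complex samples x + iy; keep the 2K+1
    # lowest-frequency FFT coefficients c_{-K}..c_{K}, scaled by 1/N.
    points = polygon[:, 0] + polygon[:, 1] * 1j
    c = fft(points) / len(points)
    c = np.hstack((c[-fourier_degree:], c[:fourier_degree + 1]))
    return np.stack([c.real, c.imag], axis=1)  # shape (2K+1, 2): real and imaginary parts

square = np.array([[0, 0], [4, 0], [4, 4], [0, 4]], dtype=np.float32)
print(fourier_signature(square, fourier_degree=1).shape)  # (3, 2)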
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/iaa_augment.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import imgaug +import imgaug.augmenters as iaa + + +class AugmenterBuilder(object): + def __init__(self): + pass + + def build(self, args, root=True): + if args is None or len(args) == 0: + return None + elif isinstance(args, list): + if root: + sequence = [self.build(value, root=False) for value in args] + return iaa.Sequential(sequence) + else: + return getattr(iaa, args[0])( + *[self.to_tuple_if_list(a) for a in args[1:]]) + elif isinstance(args, dict): + cls = getattr(iaa, args['type']) + return cls(**{ + k: self.to_tuple_if_list(v) + for k, v in args['args'].items() + }) + else: + raise RuntimeError('unknown augmenter arg: ' + str(args)) + + def to_tuple_if_list(self, obj): + if isinstance(obj, list): + return tuple(obj) + return obj + + +class IaaAugment(): + def __init__(self, augmenter_args=None, **kwargs): + if augmenter_args is None: + augmenter_args = [{ + 'type': 'Fliplr', + 'args': { + 'p': 0.5 + } + }, { + 'type': 'Affine', + 'args': { + 'rotate': [-10, 10] + } + }, { + 'type': 'Resize', + 'args': { + 'size': [0.5, 3] + } + }] + self.augmenter = AugmenterBuilder().build(augmenter_args) + + def __call__(self, data): + image = data['image'] + shape = image.shape + + if self.augmenter: + aug = self.augmenter.to_deterministic() + data['image'] = aug.augment_image(image) + data = self.may_augment_annotation(aug, data, shape) + return data + + def may_augment_annotation(self, aug, data, shape): + if aug is None: + return data + + line_polys = [] + for poly in data['polys']: + new_poly = self.may_augment_poly(aug, shape, poly) + line_polys.append(new_poly) + data['polys'] = np.array(line_polys) + return data + + def may_augment_poly(self, aug, img_shape, poly): + keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly] + keypoints = aug.augment_keypoints( + [imgaug.KeypointsOnImage( + keypoints, shape=img_shape)])[0].keypoints + poly = [(p.x, p.y) for p in keypoints] + return poly diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..6ec5da288cb7e4b3cbef00dc6c181afcabc204c2 --- /dev/null +++ b/ppocr/data/imaug/label_ops.py @@ -0,0 +1,1505 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
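A minimal usage sketch for the IaaAugment transform defined above, assuming the package is importable as ppocr.data.imaug.iaa_augment and that imgaug is installed; the image shape and polygon coordinates are placeholders.

import numpy as np
from ppocr.data.imaug.iaa_augment import IaaAugment

aug = IaaAugment()  # defaults: Fliplr(p=0.5), Affine(rotate=[-10, 10]), Resize([0.5, 3])
data = {
    'image': np.zeros((640, 640, 3), dtype=np.uint8),  # H x W x 3 input image
    'polys': np.array([[[10, 10], [100, 10], [100, 40], [10, 40]]], dtype=np.float32),
}
data = aug(data)  # image and polygon keypoints are transformed consistently
print(data['image'].shape, data['polys'].shape)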
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import numpy as np +import string +from shapely.geometry import LineString, Point, Polygon +import json +import copy +from random import sample + +from ppocr.utils.logging import get_logger +from ppocr.data.imaug.vqa.augment import order_by_tbyx + + +class ClsLabelEncode(object): + def __init__(self, label_list, **kwargs): + self.label_list = label_list + + def __call__(self, data): + label = data['label'] + if label not in self.label_list: + return None + label = self.label_list.index(label) + data['label'] = label + return data + + +class DetLabelEncode(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + if len(boxes) == 0: + return None + boxes = self.expand_points_num(boxes) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=bool) + + data['polys'] = boxes + data['texts'] = txts + data['ignore_tags'] = txt_tags + return data + + def order_points_clockwise(self, pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0) + diff = np.diff(np.array(tmp), axis=1) + rect[1] = tmp[np.argmin(diff)] + rect[3] = tmp[np.argmax(diff)] + return rect + + def expand_points_num(self, boxes): + max_points_num = 0 + for box in boxes: + if len(box) > max_points_num: + max_points_num = len(box) + ex_boxes = [] + for box in boxes: + ex_box = box + [box[-1]] * (max_points_num - len(box)) + ex_boxes.append(ex_box) + return ex_boxes + + +class BaseRecLabelEncode(object): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + lower=False): + + self.max_text_len = max_text_length + self.beg_str = "sos" + self.end_str = "eos" + self.lower = lower + + if character_dict_path is None: + logger = get_logger() + logger.warning( + "The character_dict_path is None, model can only recognize number and lower letters" + ) + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + self.lower = True + else: + self.character_str = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def add_special_char(self, dict_character): + return dict_character + + def encode(self, text): + """convert text-label into text-index. + input: + text: text labels of each image. [batch_size] + + output: + text: concatenated text index for CTCLoss. + [sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)] + length: length of each text. 
[batch_size] + """ + if len(text) == 0 or len(text) > self.max_text_len: + return None + if self.lower: + text = text.lower() + text_list = [] + for char in text: + if char not in self.dict: + # logger = get_logger() + # logger.warning('{} is not in dict'.format(char)) + continue + text_list.append(self.dict[char]) + if len(text_list) == 0: + return None + return text_list + + +class CTCLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(CTCLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + data['length'] = np.array(len(text)) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + + label = [0] * len(self.character) + for x in text: + label[x] += 1 + data['label_ace'] = np.array(label) + return data + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class E2ELabelEncodeTest(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(E2ELabelEncodeTest, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + import json + padnum = len(self.dict) + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=bool) + data['polys'] = boxes + data['ignore_tags'] = txt_tags + temp_texts = [] + for text in txts: + text = text.lower() + text = self.encode(text) + if text is None: + return None + text = text + [padnum] * (self.max_text_len - len(text) + ) # use 36 to pad + temp_texts.append(text) + data['texts'] = np.array(temp_texts) + return data + + +class E2ELabelEncodeTrain(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + import json + label = data['label'] + label = json.loads(label) + nBox = len(label) + boxes, txts, txt_tags = [], [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + txt = label[bno]['transcription'] + boxes.append(box) + txts.append(txt) + if txt in ['*', '###']: + txt_tags.append(True) + else: + txt_tags.append(False) + boxes = np.array(boxes, dtype=np.float32) + txt_tags = np.array(txt_tags, dtype=bool) + + data['polys'] = boxes + data['texts'] = txts + data['ignore_tags'] = txt_tags + return data + + +class KieLabelEncode(object): + def __init__(self, + character_dict_path, + class_path, + norm=10, + directed=False, + **kwargs): + super(KieLabelEncode, self).__init__() + self.dict = dict({'': 0}) + self.label2classid_map = dict() + with open(character_dict_path, 'r', encoding='utf-8') as fr: + idx = 1 + for line in fr: + char = line.strip() + self.dict[char] = idx + idx += 1 + with open(class_path, "r") as fin: + lines = fin.readlines() + for idx, line in enumerate(lines): + line = line.strip("\n") + self.label2classid_map[line] = idx + self.norm = norm + self.directed = directed + + def compute_relation(self, boxes): + """Compute relation between every two boxes.""" + x1s, 
y1s = boxes[:, 0:1], boxes[:, 1:2] + x2s, y2s = boxes[:, 4:5], boxes[:, 5:6] + ws, hs = x2s - x1s + 1, np.maximum(y2s - y1s + 1, 1) + dxs = (x1s[:, 0][None] - x1s) / self.norm + dys = (y1s[:, 0][None] - y1s) / self.norm + xhhs, xwhs = hs[:, 0][None] / hs, ws[:, 0][None] / hs + whs = ws / hs + np.zeros_like(xhhs) + relations = np.stack([dxs, dys, whs, xhhs, xwhs], -1) + bboxes = np.concatenate([x1s, y1s, x2s, y2s], -1).astype(np.float32) + return relations, bboxes + + def pad_text_indices(self, text_inds): + """Pad text index to same length.""" + max_len = 300 + recoder_len = max([len(text_ind) for text_ind in text_inds]) + padded_text_inds = -np.ones((len(text_inds), max_len), np.int32) + for idx, text_ind in enumerate(text_inds): + padded_text_inds[idx, :len(text_ind)] = np.array(text_ind) + return padded_text_inds, recoder_len + + def list_to_numpy(self, ann_infos): + """Convert bboxes, relations, texts and labels to ndarray.""" + boxes, text_inds = ann_infos['points'], ann_infos['text_inds'] + boxes = np.array(boxes, np.int32) + relations, bboxes = self.compute_relation(boxes) + + labels = ann_infos.get('labels', None) + if labels is not None: + labels = np.array(labels, np.int32) + edges = ann_infos.get('edges', None) + if edges is not None: + labels = labels[:, None] + edges = np.array(edges) + edges = (edges[:, None] == edges[None, :]).astype(np.int32) + if self.directed: + edges = (edges & labels == 1).astype(np.int32) + np.fill_diagonal(edges, -1) + labels = np.concatenate([labels, edges], -1) + padded_text_inds, recoder_len = self.pad_text_indices(text_inds) + max_num = 300 + temp_bboxes = np.zeros([max_num, 4]) + h, _ = bboxes.shape + temp_bboxes[:h, :] = bboxes + + temp_relations = np.zeros([max_num, max_num, 5]) + temp_relations[:h, :h, :] = relations + + temp_padded_text_inds = np.zeros([max_num, max_num]) + temp_padded_text_inds[:h, :] = padded_text_inds + + temp_labels = np.zeros([max_num, max_num]) + temp_labels[:h, :h + 1] = labels + + tag = np.array([h, recoder_len]) + return dict( + image=ann_infos['image'], + points=temp_bboxes, + relations=temp_relations, + texts=temp_padded_text_inds, + labels=temp_labels, + tag=tag) + + def convert_canonical(self, points_x, points_y): + + assert len(points_x) == 4 + assert len(points_y) == 4 + + points = [Point(points_x[i], points_y[i]) for i in range(4)] + + polygon = Polygon([(p.x, p.y) for p in points]) + min_x, min_y, _, _ = polygon.bounds + points_to_lefttop = [ + LineString([points[i], Point(min_x, min_y)]) for i in range(4) + ] + distances = np.array([line.length for line in points_to_lefttop]) + sort_dist_idx = np.argsort(distances) + lefttop_idx = sort_dist_idx[0] + + if lefttop_idx == 0: + point_orders = [0, 1, 2, 3] + elif lefttop_idx == 1: + point_orders = [1, 2, 3, 0] + elif lefttop_idx == 2: + point_orders = [2, 3, 0, 1] + else: + point_orders = [3, 0, 1, 2] + + sorted_points_x = [points_x[i] for i in point_orders] + sorted_points_y = [points_y[j] for j in point_orders] + + return sorted_points_x, sorted_points_y + + def sort_vertex(self, points_x, points_y): + + assert len(points_x) == 4 + assert len(points_y) == 4 + + x = np.array(points_x) + y = np.array(points_y) + center_x = np.sum(x) * 0.25 + center_y = np.sum(y) * 0.25 + + x_arr = np.array(x - center_x) + y_arr = np.array(y - center_y) + + angle = np.arctan2(y_arr, x_arr) * 180.0 / np.pi + sort_idx = np.argsort(angle) + + sorted_points_x, sorted_points_y = [], [] + for i in range(4): + sorted_points_x.append(points_x[sort_idx[i]]) + 
sorted_points_y.append(points_y[sort_idx[i]]) + + return self.convert_canonical(sorted_points_x, sorted_points_y) + + def __call__(self, data): + import json + label = data['label'] + annotations = json.loads(label) + boxes, texts, text_inds, labels, edges = [], [], [], [], [] + for ann in annotations: + box = ann['points'] + x_list = [box[i][0] for i in range(4)] + y_list = [box[i][1] for i in range(4)] + sorted_x_list, sorted_y_list = self.sort_vertex(x_list, y_list) + sorted_box = [] + for x, y in zip(sorted_x_list, sorted_y_list): + sorted_box.append(x) + sorted_box.append(y) + boxes.append(sorted_box) + text = ann['transcription'] + texts.append(ann['transcription']) + text_ind = [self.dict[c] for c in text if c in self.dict] + text_inds.append(text_ind) + if 'label' in ann.keys(): + labels.append(self.label2classid_map[ann['label']]) + elif 'key_cls' in ann.keys(): + labels.append(ann['key_cls']) + else: + raise ValueError( + "Cannot found 'key_cls' in ann.keys(), please check your training annotation." + ) + edges.append(ann.get('edge', 0)) + ann_infos = dict( + image=data['image'], + points=boxes, + texts=texts, + text_inds=text_inds, + edges=edges, + labels=labels) + + return self.list_to_numpy(ann_infos) + + +class AttnLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(AttnLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len + - len(text) - 2) + data['label'] = np.array(text) + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class RFLLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(RFLLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def encode_cnt(self, text): + cnt_label = [0.0] * len(self.character) + for char_ in text: + cnt_label[char_] += 1 + return np.array(cnt_label) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + cnt_label = self.encode_cnt(text) + data['length'] = np.array(len(text)) + text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len + - len(text) - 2) + if len(text) != self.max_text_len: + return None + data['label'] = 
np.array(text) + data['cnt_label'] = cnt_label + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SEEDLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + self.padding = "padding" + self.end_str = "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding, self.unknown + ] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + 1 # conclude eos + text = text + [len(self.character) - 3] + [len(self.character) - 2] * ( + self.max_text_len - len(text) - 1) + data['label'] = np.array(text) + return data + + +class SRNLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length=25, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SRNLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + char_num = len(self.character) + if text is None: + return None + if len(text) > self.max_text_len: + return None + data['length'] = np.array(len(text)) + text = text + [char_num - 1] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class TableLabelEncode(AttnLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path, + replace_empty_cell_token=False, + merge_no_span_structure=False, + learn_empty_box=False, + loc_reg_num=4, + **kwargs): + self.max_text_len = max_text_length + self.lower = False + self.learn_empty_box = learn_empty_box + self.merge_no_span_structure = merge_no_span_structure + self.replace_empty_cell_token = replace_empty_cell_token + + dict_character = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + dict_character.append(line) + + if self.merge_no_span_structure: + if "" not in dict_character: + dict_character.append("") + if "" in dict_character: + dict_character.remove("") + + dict_character = self.add_special_char(dict_character) 
+ self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.idx2char = {v: k for k, v in self.dict.items()} + + self.character = dict_character + self.loc_reg_num = loc_reg_num + self.pad_idx = self.dict[self.beg_str] + self.start_idx = self.dict[self.beg_str] + self.end_idx = self.dict[self.end_str] + + self.td_token = ['', '', ''] + self.empty_bbox_token_dict = { + "[]": '', + "[' ']": '', + "['', ' ', '']": '', + "['\\u2028', '\\u2028']": '', + "['', ' ', '']": '', + "['', '']": '', + "['', ' ', '']": '', + "['', '', '', '']": '', + "['', '', ' ', '', '']": '', + "['', '']": '', + "['', ' ', '\\u2028', ' ', '\\u2028', ' ', '']": + '', + } + + @property + def _max_text_len(self): + return self.max_text_len + 2 + + def __call__(self, data): + cells = data['cells'] + structure = data['structure'] + if self.merge_no_span_structure: + structure = self._merge_no_span_structure(structure) + if self.replace_empty_cell_token: + structure = self._replace_empty_cell_token(structure, cells) + # remove empty token and add " " to span token + new_structure = [] + for token in structure: + if token != '': + if 'span' in token and token[0] != ' ': + token = ' ' + token + new_structure.append(token) + # encode structure + structure = self.encode(new_structure) + if structure is None: + return None + + structure = [self.start_idx] + structure + [self.end_idx + ] # add sos abd eos + structure = structure + [self.pad_idx] * (self._max_text_len - + len(structure)) # pad + structure = np.array(structure) + data['structure'] = structure + + if len(structure) > self._max_text_len: + return None + + # encode box + bboxes = np.zeros( + (self._max_text_len, self.loc_reg_num), dtype=np.float32) + bbox_masks = np.zeros((self._max_text_len, 1), dtype=np.float32) + + bbox_idx = 0 + + for i, token in enumerate(structure): + if self.idx2char[token] in self.td_token: + if 'bbox' in cells[bbox_idx] and len(cells[bbox_idx][ + 'tokens']) > 0: + bbox = cells[bbox_idx]['bbox'].copy() + bbox = np.array(bbox, dtype=np.float32).reshape(-1) + bboxes[i] = bbox + bbox_masks[i] = 1.0 + if self.learn_empty_box: + bbox_masks[i] = 1.0 + bbox_idx += 1 + data['bboxes'] = bboxes + data['bbox_masks'] = bbox_masks + return data + + def _merge_no_span_structure(self, structure): + """ + This code is refer from: + https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/data_preprocess.py + """ + new_structure = [] + i = 0 + while i < len(structure): + token = structure[i] + if token == '': + token = '' + i += 1 + new_structure.append(token) + i += 1 + return new_structure + + def _replace_empty_cell_token(self, token_list, cells): + """ + This fun code is refer from: + https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/data_preprocess.py + """ + + bbox_idx = 0 + add_empty_bbox_token_list = [] + for token in token_list: + if token in ['', '']: + if 'bbox' not in cells[bbox_idx].keys(): + content = str(cells[bbox_idx]['tokens']) + token = self.empty_bbox_token_dict[content] + add_empty_bbox_token_list.append(token) + bbox_idx += 1 + else: + add_empty_bbox_token_list.append(token) + return add_empty_bbox_token_list + + +class TableMasterLabelEncode(TableLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path, + replace_empty_cell_token=False, + merge_no_span_structure=False, + learn_empty_box=False, + loc_reg_num=4, + **kwargs): + super(TableMasterLabelEncode, self).__init__( + max_text_length, 
character_dict_path, replace_empty_cell_token, + merge_no_span_structure, learn_empty_box, loc_reg_num, **kwargs) + self.pad_idx = self.dict[self.pad_str] + self.unknown_idx = self.dict[self.unknown_str] + + @property + def _max_text_len(self): + return self.max_text_len + + def add_special_char(self, dict_character): + self.beg_str = '' + self.end_str = '' + self.unknown_str = '' + self.pad_str = '' + dict_character = dict_character + dict_character = dict_character + [ + self.unknown_str, self.beg_str, self.end_str, self.pad_str + ] + return dict_character + + +class TableBoxEncode(object): + def __init__(self, in_box_format='xyxy', out_box_format='xyxy', **kwargs): + assert out_box_format in ['xywh', 'xyxy', 'xyxyxyxy'] + self.in_box_format = in_box_format + self.out_box_format = out_box_format + + def __call__(self, data): + img_height, img_width = data['image'].shape[:2] + bboxes = data['bboxes'] + if self.in_box_format != self.out_box_format: + if self.out_box_format == 'xywh': + if self.in_box_format == 'xyxyxyxy': + bboxes = self.xyxyxyxy2xywh(bboxes) + elif self.in_box_format == 'xyxy': + bboxes = self.xyxy2xywh(bboxes) + + bboxes[:, 0::2] /= img_width + bboxes[:, 1::2] /= img_height + data['bboxes'] = bboxes + return data + + def xyxyxyxy2xywh(self, boxes): + new_bboxes = np.zeros([len(bboxes), 4]) + new_bboxes[:, 0] = bboxes[:, 0::2].min() # x1 + new_bboxes[:, 1] = bboxes[:, 1::2].min() # y1 + new_bboxes[:, 2] = bboxes[:, 0::2].max() - new_bboxes[:, 0] # w + new_bboxes[:, 3] = bboxes[:, 1::2].max() - new_bboxes[:, 1] # h + return new_bboxes + + def xyxy2xywh(self, bboxes): + new_bboxes = np.empty_like(bboxes) + new_bboxes[:, 0] = (bboxes[:, 0] + bboxes[:, 2]) / 2 # x center + new_bboxes[:, 1] = (bboxes[:, 1] + bboxes[:, 3]) / 2 # y center + new_bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] # width + new_bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] # height + return new_bboxes + + +class SARLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SARLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + target = [self.start_idx] + text + [self.end_idx] + padded_text = [self.padding_idx for _ in range(self.max_text_len)] + + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class PRENLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path, + use_space_char=False, + **kwargs): + super(PRENLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, 
end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def encode(self, text): + if len(text) == 0 or len(text) >= self.max_text_len: + return None + if self.lower: + text = text.lower() + text_list = [] + for char in text: + if char not in self.dict: + text_list.append(self.unknown_idx) + else: + text_list.append(self.dict[char]) + text_list.append(self.end_idx) + if len(text_list) < self.max_text_len: + text_list += [self.padding_idx] * ( + self.max_text_len - len(text_list)) + return text_list + + def __call__(self, data): + text = data['label'] + encoded_text = self.encode(text) + if encoded_text is None: + return None + data['label'] = np.array(encoded_text) + return data + + +class VQATokenLabelEncode(object): + """ + Label encode for NLP VQA methods + """ + + def __init__(self, + class_path, + contains_re=False, + add_special_ids=False, + algorithm='LayoutXLM', + use_textline_bbox_info=True, + order_method=None, + infer_mode=False, + ocr_engine=None, + **kwargs): + super(VQATokenLabelEncode, self).__init__() + from paddlenlp.transformers import LayoutXLMTokenizer, LayoutLMTokenizer, LayoutLMv2Tokenizer + from ppocr.utils.utility import load_vqa_bio_label_maps + tokenizer_dict = { + 'LayoutXLM': { + 'class': LayoutXLMTokenizer, + 'pretrained_model': 'layoutxlm-base-uncased' + }, + 'LayoutLM': { + 'class': LayoutLMTokenizer, + 'pretrained_model': 'layoutlm-base-uncased' + }, + 'LayoutLMv2': { + 'class': LayoutLMv2Tokenizer, + 'pretrained_model': 'layoutlmv2-base-uncased' + } + } + self.contains_re = contains_re + tokenizer_config = tokenizer_dict[algorithm] + self.tokenizer = tokenizer_config['class'].from_pretrained( + tokenizer_config['pretrained_model']) + self.label2id_map, id2label_map = load_vqa_bio_label_maps(class_path) + self.add_special_ids = add_special_ids + self.infer_mode = infer_mode + self.ocr_engine = ocr_engine + self.use_textline_bbox_info = use_textline_bbox_info + self.order_method = order_method + assert self.order_method in [None, "tb-yx"] + + def split_bbox(self, bbox, text, tokenizer): + words = text.split() + token_bboxes = [] + curr_word_idx = 0 + x1, y1, x2, y2 = bbox + unit_w = (x2 - x1) / len(text) + for idx, word in enumerate(words): + curr_w = len(word) * unit_w + word_bbox = [x1, y1, x1 + curr_w, y2] + token_bboxes.extend([word_bbox] * len(tokenizer.tokenize(word))) + x1 += (len(word) + 1) * unit_w + return token_bboxes + + def filter_empty_contents(self, ocr_info): + """ + find out the empty texts and remove the links + """ + new_ocr_info = [] + empty_index = [] + for idx, info in enumerate(ocr_info): + if len(info["transcription"]) > 0: + new_ocr_info.append(copy.deepcopy(info)) + else: + empty_index.append(info["id"]) + + for idx, info in enumerate(new_ocr_info): + new_link = [] + for link in info["linking"]: + if link[0] in empty_index or link[1] in empty_index: + continue + new_link.append(link) + new_ocr_info[idx]["linking"] = new_link + return new_ocr_info + + def __call__(self, data): + # load bbox and label info + ocr_info = self._load_ocr_info(data) + + for idx in range(len(ocr_info)): + if "bbox" not in ocr_info[idx]: + ocr_info[idx]["bbox"] = self.trans_poly_to_bbox(ocr_info[idx][ + "points"]) + + if self.order_method == "tb-yx": + ocr_info = order_by_tbyx(ocr_info) + + # for re + train_re = self.contains_re and not self.infer_mode + if train_re: + ocr_info = self.filter_empty_contents(ocr_info) + + height, width, _ = data['image'].shape + + words_list = [] + 
bbox_list = [] + input_ids_list = [] + token_type_ids_list = [] + segment_offset_id = [] + gt_label_list = [] + + entities = [] + + if train_re: + relations = [] + id2label = {} + entity_id_to_index_map = {} + empty_entity = set() + + data['ocr_info'] = copy.deepcopy(ocr_info) + + for info in ocr_info: + text = info["transcription"] + if len(text) <= 0: + continue + if train_re: + # for re + if len(text) == 0: + empty_entity.add(info["id"]) + continue + id2label[info["id"]] = info["label"] + relations.extend([tuple(sorted(l)) for l in info["linking"]]) + # smooth_box + info["bbox"] = self.trans_poly_to_bbox(info["points"]) + + encode_res = self.tokenizer.encode( + text, + pad_to_max_seq_len=False, + return_attention_mask=True, + return_token_type_ids=True) + + if not self.add_special_ids: + # TODO: use tok.all_special_ids to remove + encode_res["input_ids"] = encode_res["input_ids"][1:-1] + encode_res["token_type_ids"] = encode_res["token_type_ids"][1: + -1] + encode_res["attention_mask"] = encode_res["attention_mask"][1: + -1] + + if self.use_textline_bbox_info: + bbox = [info["bbox"]] * len(encode_res["input_ids"]) + else: + bbox = self.split_bbox(info["bbox"], info["transcription"], + self.tokenizer) + if len(bbox) <= 0: + continue + bbox = self._smooth_box(bbox, height, width) + if self.add_special_ids: + bbox.insert(0, [0, 0, 0, 0]) + bbox.append([0, 0, 0, 0]) + + # parse label + if not self.infer_mode: + label = info['label'] + gt_label = self._parse_label(label, encode_res) + + # construct entities for re + if train_re: + if gt_label[0] != self.label2id_map["O"]: + entity_id_to_index_map[info["id"]] = len(entities) + label = label.upper() + entities.append({ + "start": len(input_ids_list), + "end": + len(input_ids_list) + len(encode_res["input_ids"]), + "label": label.upper(), + }) + else: + entities.append({ + "start": len(input_ids_list), + "end": len(input_ids_list) + len(encode_res["input_ids"]), + "label": 'O', + }) + input_ids_list.extend(encode_res["input_ids"]) + token_type_ids_list.extend(encode_res["token_type_ids"]) + bbox_list.extend(bbox) + words_list.append(text) + segment_offset_id.append(len(input_ids_list)) + if not self.infer_mode: + gt_label_list.extend(gt_label) + + data['input_ids'] = input_ids_list + data['token_type_ids'] = token_type_ids_list + data['bbox'] = bbox_list + data['attention_mask'] = [1] * len(input_ids_list) + data['labels'] = gt_label_list + data['segment_offset_id'] = segment_offset_id + data['tokenizer_params'] = dict( + padding_side=self.tokenizer.padding_side, + pad_token_type_id=self.tokenizer.pad_token_type_id, + pad_token_id=self.tokenizer.pad_token_id) + data['entities'] = entities + + if train_re: + data['relations'] = relations + data['id2label'] = id2label + data['empty_entity'] = empty_entity + data['entity_id_to_index_map'] = entity_id_to_index_map + return data + + def trans_poly_to_bbox(self, poly): + x1 = int(np.min([p[0] for p in poly])) + x2 = int(np.max([p[0] for p in poly])) + y1 = int(np.min([p[1] for p in poly])) + y2 = int(np.max([p[1] for p in poly])) + return [x1, y1, x2, y2] + + def _load_ocr_info(self, data): + if self.infer_mode: + ocr_result = self.ocr_engine.ocr(data['image'], cls=False)[0] + ocr_info = [] + for res in ocr_result: + ocr_info.append({ + "transcription": res[1][0], + "bbox": self.trans_poly_to_bbox(res[0]), + "points": res[0], + }) + return ocr_info + else: + info = data['label'] + # read text info + info_dict = json.loads(info) + return info_dict + + def _smooth_box(self, bboxes, height, width): + 
bboxes = np.array(bboxes) + bboxes[:, 0] = bboxes[:, 0] * 1000 / width + bboxes[:, 2] = bboxes[:, 2] * 1000 / width + bboxes[:, 1] = bboxes[:, 1] * 1000 / height + bboxes[:, 3] = bboxes[:, 3] * 1000 / height + bboxes = bboxes.astype("int64").tolist() + return bboxes + + def _parse_label(self, label, encode_res): + gt_label = [] + if label.lower() in ["other", "others", "ignore"]: + gt_label.extend([0] * len(encode_res["input_ids"])) + else: + gt_label.append(self.label2id_map[("b-" + label).upper()]) + gt_label.extend([self.label2id_map[("i-" + label).upper()]] * + (len(encode_res["input_ids"]) - 1)) + return gt_label + + +class MultiLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(MultiLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + self.ctc_encode = CTCLabelEncode(max_text_length, character_dict_path, + use_space_char, **kwargs) + self.sar_encode = SARLabelEncode(max_text_length, character_dict_path, + use_space_char, **kwargs) + + def __call__(self, data): + data_ctc = copy.deepcopy(data) + data_sar = copy.deepcopy(data) + data_out = dict() + data_out['img_path'] = data.get('img_path', None) + data_out['image'] = data['image'] + ctc = self.ctc_encode.__call__(data_ctc) + sar = self.sar_encode.__call__(data_sar) + if ctc is None or sar is None: + return None + data_out['label_ctc'] = ctc['label'] + data_out['label_sar'] = sar['label'] + data_out['length'] = ctc['length'] + return data_out + + +class NRTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + + super(NRTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len - 1: + return None + data['length'] = np.array(len(text)) + text.insert(0, 2) + text.append(3) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + +class ViTSTRLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + ignore_index=0, + **kwargs): + + super(ViTSTRLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.ignore_index = ignore_index + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text.insert(0, self.ignore_index) + text.append(1) + text = text + [self.ignore_index] * (self.max_text_len + 2 - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + ignore_index=100, + **kwargs): + + super(ABINetLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.ignore_index = ignore_index 
+ + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) >= self.max_text_len: + return None + data['length'] = np.array(len(text)) + text.append(0) + text = text + [self.ignore_index] * (self.max_text_len + 1 - len(text)) + data['label'] = np.array(text) + return data + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character + + +class SRLabelEncode(BaseRecLabelEncode): + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(SRLabelEncode, self).__init__(max_text_length, + character_dict_path, use_space_char) + self.dic = {} + with open(character_dict_path, 'r') as fin: + for line in fin.readlines(): + line = line.strip() + character, sequence = line.split() + self.dic[character] = sequence + english_stroke_alphabet = '0123456789' + self.english_stroke_dict = {} + for index in range(len(english_stroke_alphabet)): + self.english_stroke_dict[english_stroke_alphabet[index]] = index + + def encode(self, label): + stroke_sequence = '' + for character in label: + if character not in self.dic: + continue + else: + stroke_sequence += self.dic[character] + stroke_sequence += '0' + label = stroke_sequence + + length = len(label) + + input_tensor = np.zeros(self.max_text_len).astype("int64") + for j in range(length - 1): + input_tensor[j + 1] = self.english_stroke_dict[label[j]] + + return length, input_tensor + + def __call__(self, data): + text = data['label'] + length, input_tensor = self.encode(text) + + data["length"] = length + data["input_tensor"] = input_tensor + if text is None: + return None + return data + + +class SPINLabelEncode(AttnLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + lower=True, + **kwargs): + super(SPINLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char) + self.lower = lower + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + [self.end_str] + dict_character + return dict_character + + def __call__(self, data): + text = data['label'] + text = self.encode(text) + if text is None: + return None + if len(text) > self.max_text_len: + return None + data['length'] = np.array(len(text)) + target = [0] + text + [1] + padded_text = [0 for _ in range(self.max_text_len + 2)] + + padded_text[:len(target)] = target + data['label'] = np.array(padded_text) + return data + + +class VLLabelEncode(BaseRecLabelEncode): + """ Convert between text-label and text-index """ + + def __init__(self, + max_text_length, + character_dict_path=None, + use_space_char=False, + **kwargs): + super(VLLabelEncode, self).__init__(max_text_length, + character_dict_path, use_space_char) + self.dict = {} + for i, char in enumerate(self.character): + self.dict[char] = i + + def __call__(self, data): + text = data['label'] # original string + # generate occluded text + len_str = len(text) + if len_str <= 0: + return None + change_num = 1 + order = list(range(len_str)) + change_id = sample(order, change_num)[0] + label_sub = text[change_id] + if change_id == (len_str - 1): + label_res = text[:change_id] + elif change_id == 0: + label_res = text[1:] + else: + label_res = text[:change_id] + text[change_id + 1:] + + data['label_res'] = label_res # remaining string + data['label_sub'] = label_sub # occluded character + 
data['label_id'] = change_id # character index + # encode label + text = self.encode(text) + if text is None: + return None + text = [i + 1 for i in text] + data['length'] = np.array(len(text)) + text = text + [0] * (self.max_text_len - len(text)) + data['label'] = np.array(text) + label_res = self.encode(label_res) + label_sub = self.encode(label_sub) + if label_res is None: + label_res = [] + else: + label_res = [i + 1 for i in label_res] + if label_sub is None: + label_sub = [] + else: + label_sub = [i + 1 for i in label_sub] + data['length_res'] = np.array(len(label_res)) + data['length_sub'] = np.array(len(label_sub)) + label_res = label_res + [0] * (self.max_text_len - len(label_res)) + label_sub = label_sub + [0] * (self.max_text_len - len(label_sub)) + data['label_res'] = np.array(label_res) + data['label_sub'] = np.array(label_sub) + return data + + +class CTLabelEncode(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + label = data['label'] + + label = json.loads(label) + nBox = len(label) + boxes, txts = [], [] + for bno in range(0, nBox): + box = label[bno]['points'] + box = np.array(box) + + boxes.append(box) + txt = label[bno]['transcription'] + txts.append(txt) + + if len(boxes) == 0: + return None + + data['polys'] = boxes + data['texts'] = txts + return data + + +class CANLabelEncode(BaseRecLabelEncode): + def __init__(self, + character_dict_path, + max_text_length=100, + use_space_char=False, + lower=True, + **kwargs): + super(CANLabelEncode, self).__init__( + max_text_length, character_dict_path, use_space_char, lower) + + def encode(self, text_seq): + text_seq_encoded = [] + for text in text_seq: + if text not in self.character: + continue + text_seq_encoded.append(self.dict.get(text)) + if len(text_seq_encoded) == 0: + return None + return text_seq_encoded + + def __call__(self, data): + label = data['label'] + if isinstance(label, str): + label = label.strip().split() + label.append(self.end_str) + data['label'] = self.encode(label) + return data diff --git a/ppocr/data/imaug/make_border_map.py b/ppocr/data/imaug/make_border_map.py new file mode 100644 index 0000000000000000000000000000000000000000..abab38368db2de84e54b060598fc509a65219296 --- /dev/null +++ b/ppocr/data/imaug/make_border_map.py @@ -0,0 +1,173 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_border_map.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 + +np.seterr(divide='ignore', invalid='ignore') +import pyclipper +from shapely.geometry import Polygon +import sys +import warnings + +warnings.simplefilter("ignore") + +__all__ = ['MakeBorderMap'] + + +class MakeBorderMap(object): + def __init__(self, + shrink_ratio=0.4, + thresh_min=0.3, + thresh_max=0.7, + **kwargs): + self.shrink_ratio = shrink_ratio + self.thresh_min = thresh_min + self.thresh_max = thresh_max + + def __call__(self, data): + + img = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + canvas = np.zeros(img.shape[:2], dtype=np.float32) + mask = np.zeros(img.shape[:2], dtype=np.float32) + + for i in range(len(text_polys)): + if ignore_tags[i]: + continue + self.draw_border_map(text_polys[i], canvas, mask=mask) + canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min + + data['threshold_map'] = canvas + data['threshold_mask'] = mask + return data + + def draw_border_map(self, polygon, canvas, mask): + polygon = np.array(polygon) + assert polygon.ndim == 2 + assert polygon.shape[1] == 2 + + polygon_shape = Polygon(polygon) + if polygon_shape.area <= 0: + return + distance = polygon_shape.area * ( + 1 - np.power(self.shrink_ratio, 2)) / polygon_shape.length + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + + padded_polygon = np.array(padding.Execute(distance)[0]) + cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0) + + xmin = padded_polygon[:, 0].min() + xmax = padded_polygon[:, 0].max() + ymin = padded_polygon[:, 1].min() + ymax = padded_polygon[:, 1].max() + width = xmax - xmin + 1 + height = ymax - ymin + 1 + + polygon[:, 0] = polygon[:, 0] - xmin + polygon[:, 1] = polygon[:, 1] - ymin + + xs = np.broadcast_to( + np.linspace( + 0, width - 1, num=width).reshape(1, width), (height, width)) + ys = np.broadcast_to( + np.linspace( + 0, height - 1, num=height).reshape(height, 1), (height, width)) + + distance_map = np.zeros( + (polygon.shape[0], height, width), dtype=np.float32) + for i in range(polygon.shape[0]): + j = (i + 1) % polygon.shape[0] + absolute_distance = self._distance(xs, ys, polygon[i], polygon[j]) + distance_map[i] = np.clip(absolute_distance / distance, 0, 1) + distance_map = distance_map.min(axis=0) + + xmin_valid = min(max(0, xmin), canvas.shape[1] - 1) + xmax_valid = min(max(0, xmax), canvas.shape[1] - 1) + ymin_valid = min(max(0, ymin), canvas.shape[0] - 1) + ymax_valid = min(max(0, ymax), canvas.shape[0] - 1) + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1] = np.fmax( + 1 - distance_map[ymin_valid - ymin:ymax_valid - ymax + height, + xmin_valid - xmin:xmax_valid - xmax + width], + canvas[ymin_valid:ymax_valid + 1, xmin_valid:xmax_valid + 1]) + + def _distance(self, xs, ys, point_1, point_2): + ''' + compute the distance from point to a line + ys: coordinates in the first axis + xs: coordinates in the second axis + point_1, point_2: (x, y), the end of the line + ''' + height, width = xs.shape[:2] + square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[ + 1]) + square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[ + 1]) + 
square_distance = np.square(point_1[0] - point_2[0]) + np.square( + point_1[1] - point_2[1]) + + cosin = (square_distance - square_distance_1 - square_distance_2) / ( + 2 * np.sqrt(square_distance_1 * square_distance_2)) + square_sin = 1 - np.square(cosin) + square_sin = np.nan_to_num(square_sin) + result = np.sqrt(square_distance_1 * square_distance_2 * square_sin / + square_distance) + + result[cosin < + 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin + < 0] + # self.extend_line(point_1, point_2, result) + return result + + def extend_line(self, point_1, point_2, result, shrink_ratio): + ex_point_1 = (int( + round(point_1[0] + (point_1[0] - point_2[0]) * (1 + shrink_ratio))), + int( + round(point_1[1] + (point_1[1] - point_2[1]) * ( + 1 + shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_1), + tuple(point_1), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + ex_point_2 = (int( + round(point_2[0] + (point_2[0] - point_1[0]) * (1 + shrink_ratio))), + int( + round(point_2[1] + (point_2[1] - point_1[1]) * ( + 1 + shrink_ratio)))) + cv2.line( + result, + tuple(ex_point_2), + tuple(point_2), + 4096.0, + 1, + lineType=cv2.LINE_AA, + shift=0) + return ex_point_1, ex_point_2 diff --git a/ppocr/data/imaug/make_pse_gt.py b/ppocr/data/imaug/make_pse_gt.py new file mode 100644 index 0000000000000000000000000000000000000000..255d076bde848d53f3b2fb04e80594872f4ae8c7 --- /dev/null +++ b/ppocr/data/imaug/make_pse_gt.py @@ -0,0 +1,106 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np +import pyclipper +from shapely.geometry import Polygon + +__all__ = ['MakePseGt'] + + +class MakePseGt(object): + def __init__(self, kernel_num=7, size=640, min_shrink_ratio=0.4, **kwargs): + self.kernel_num = kernel_num + self.min_shrink_ratio = min_shrink_ratio + self.size = size + + def __call__(self, data): + + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + h, w, _ = image.shape + short_edge = min(h, w) + if short_edge < self.size: + # keep short_size >= self.size + scale = self.size / short_edge + image = cv2.resize(image, dsize=None, fx=scale, fy=scale) + text_polys *= scale + + gt_kernels = [] + for i in range(1, self.kernel_num + 1): + # s1->sn, from big to small + rate = 1.0 - (1.0 - self.min_shrink_ratio) / (self.kernel_num - 1 + ) * i + text_kernel, ignore_tags = self.generate_kernel( + image.shape[0:2], rate, text_polys, ignore_tags) + gt_kernels.append(text_kernel) + + training_mask = np.ones(image.shape[0:2], dtype='uint8') + for i in range(text_polys.shape[0]): + if ignore_tags[i]: + cv2.fillPoly(training_mask, + text_polys[i].astype(np.int32)[np.newaxis, :, :], + 0) + + gt_kernels = np.array(gt_kernels) + gt_kernels[gt_kernels > 0] = 1 + + data['image'] = image + data['polys'] = text_polys + data['gt_kernels'] = gt_kernels[0:] + data['gt_text'] = gt_kernels[0] + data['mask'] = training_mask.astype('float32') + return data + + def generate_kernel(self, + img_size, + shrink_ratio, + text_polys, + ignore_tags=None): + """ + Refer to part of the code: + https://github.com/open-mmlab/mmocr/blob/main/mmocr/datasets/pipelines/textdet_targets/base_textdet_targets.py + """ + + h, w = img_size + text_kernel = np.zeros((h, w), dtype=np.float32) + for i, poly in enumerate(text_polys): + polygon = Polygon(poly) + distance = polygon.area * (1 - shrink_ratio * shrink_ratio) / ( + polygon.length + 1e-6) + subject = [tuple(l) for l in poly] + pco = pyclipper.PyclipperOffset() + pco.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + shrinked = np.array(pco.Execute(-distance)) + + if len(shrinked) == 0 or shrinked.size == 0: + if ignore_tags is not None: + ignore_tags[i] = True + continue + try: + shrinked = np.array(shrinked[0]).reshape(-1, 2) + except: + if ignore_tags is not None: + ignore_tags[i] = True + continue + cv2.fillPoly(text_kernel, [shrinked.astype(np.int32)], i + 1) + return text_kernel, ignore_tags diff --git a/ppocr/data/imaug/make_shrink_map.py b/ppocr/data/imaug/make_shrink_map.py new file mode 100644 index 0000000000000000000000000000000000000000..6c65c20e5621f91a5b1fba549b059c92923fca6f --- /dev/null +++ b/ppocr/data/imaug/make_shrink_map.py @@ -0,0 +1,123 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
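The kernel schedule used by MakePseGt above shrinks each successive kernel by an equal step of (1 - min_shrink_ratio) / (kernel_num - 1); a tiny dependency-free sketch of that schedule with the default settings:

kernel_num, min_shrink_ratio = 7, 0.4
rates = [1.0 - (1.0 - min_shrink_ratio) / (kernel_num - 1) * i
         for i in range(1, kernel_num + 1)]
print([round(r, 2) for r in rates])  # [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3]

Each rate feeds generate_kernel, which offsets the polygon inward by area * (1 - rate^2) / perimeter via pyclipper, so smaller rates produce smaller kernels.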
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/make_shrink_map.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 +from shapely.geometry import Polygon +import pyclipper + +__all__ = ['MakeShrinkMap'] + + +class MakeShrinkMap(object): + r''' + Making binary mask from detection data with ICDAR format. + Typically following the process of class `MakeICDARData`. + ''' + + def __init__(self, min_text_size=8, shrink_ratio=0.4, **kwargs): + self.min_text_size = min_text_size + self.shrink_ratio = shrink_ratio + + def __call__(self, data): + image = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + + h, w = image.shape[:2] + text_polys, ignore_tags = self.validate_polygons(text_polys, + ignore_tags, h, w) + gt = np.zeros((h, w), dtype=np.float32) + mask = np.ones((h, w), dtype=np.float32) + for i in range(len(text_polys)): + polygon = text_polys[i] + height = max(polygon[:, 1]) - min(polygon[:, 1]) + width = max(polygon[:, 0]) - min(polygon[:, 0]) + if ignore_tags[i] or min(height, width) < self.min_text_size: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + else: + polygon_shape = Polygon(polygon) + subject = [tuple(l) for l in polygon] + padding = pyclipper.PyclipperOffset() + padding.AddPath(subject, pyclipper.JT_ROUND, + pyclipper.ET_CLOSEDPOLYGON) + shrinked = [] + + # Increase the shrink ratio every time we get multiple polygon returned back + possible_ratios = np.arange(self.shrink_ratio, 1, + self.shrink_ratio) + np.append(possible_ratios, 1) + # print(possible_ratios) + for ratio in possible_ratios: + # print(f"Change shrink ratio to {ratio}") + distance = polygon_shape.area * ( + 1 - np.power(ratio, 2)) / polygon_shape.length + shrinked = padding.Execute(-distance) + if len(shrinked) == 1: + break + + if shrinked == []: + cv2.fillPoly(mask, + polygon.astype(np.int32)[np.newaxis, :, :], 0) + ignore_tags[i] = True + continue + + for each_shirnk in shrinked: + shirnk = np.array(each_shirnk).reshape(-1, 2) + cv2.fillPoly(gt, [shirnk.astype(np.int32)], 1) + + data['shrink_map'] = gt + data['shrink_mask'] = mask + return data + + def validate_polygons(self, polygons, ignore_tags, h, w): + ''' + polygons (numpy.array, required): of shape (num_instances, num_points, 2) + ''' + if len(polygons) == 0: + return polygons, ignore_tags + assert len(polygons) == len(ignore_tags) + for polygon in polygons: + polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1) + polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1) + + for i in range(len(polygons)): + area = self.polygon_area(polygons[i]) + if abs(area) < 1: + ignore_tags[i] = True + if area > 0: + polygons[i] = polygons[i][::-1, :] + return polygons, ignore_tags + + def polygon_area(self, polygon): + """ + compute polygon area + """ + area = 0 + q = polygon[-1] + for p in polygon: + area += p[0] * q[1] - p[1] * q[0] + q = p + return area / 2.0 diff --git a/ppocr/data/imaug/operators.py b/ppocr/data/imaug/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..4ff2d29ed32df906c42b28f97a81b20f716cb0fd --- /dev/null +++ b/ppocr/data/imaug/operators.py @@ -0,0 +1,524 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np +import math +from PIL import Image + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, + img_mode='RGB', + channel_first=False, + ignore_orientation=False, + **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + self.ignore_orientation = ignore_orientation + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + if self.ignore_orientation: + img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION | + cv2.IMREAD_COLOR) + else: + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class Fasttext(object): + def __init__(self, path="None", **kwargs): + import fasttext + self.fast_model = fasttext.load_model(path) + + def __call__(self, data): + label = data['label'] + fast_label = self.fast_model[label] + data['fast_label'] = fast_label + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + 
data_list.append(data[key]) + return data_list + + +class Pad(object): + def __init__(self, size=None, size_div=32, **kwargs): + if size is not None and not isinstance(size, (int, list, tuple)): + raise TypeError("Type of target_size is invalid. Now is {}".format( + type(size))) + if isinstance(size, int): + size = [size, size] + self.size = size + self.size_div = size_div + + def __call__(self, data): + + img = data['image'] + img_h, img_w = img.shape[0], img.shape[1] + if self.size: + resize_h2, resize_w2 = self.size + assert ( + img_h < resize_h2 and img_w < resize_w2 + ), '(h, w) of target size should be greater than (img_h, img_w)' + else: + resize_h2 = max( + int(math.ceil(img.shape[0] / self.size_div) * self.size_div), + self.size_div) + resize_w2 = max( + int(math.ceil(img.shape[1] / self.size_div) * self.size_div), + self.size_div) + img = cv2.copyMakeBorder( + img, + 0, + resize_h2 - img_h, + 0, + resize_w2 - img_w, + cv2.BORDER_CONSTANT, + value=0) + data['image'] = img + return data + + +class Resize(object): + def __init__(self, size=(640, 640), **kwargs): + self.size = size + + def resize_image(self, img): + resize_h, resize_w = self.size + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), int(resize_h))) + return img, [ratio_h, ratio_w] + + def __call__(self, data): + img = data['image'] + if 'polys' in data: + text_polys = data['polys'] + + img_resize, [ratio_h, ratio_w] = self.resize_image(img) + if 'polys' in data: + new_boxes = [] + for box in text_polys: + new_box = [] + for cord in box: + new_box.append([cord[0] * ratio_w, cord[1] * ratio_h]) + new_boxes.append(new_box) + data['polys'] = np.array(new_boxes, dtype=np.float32) + data['image'] = img_resize + return data + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.resize_type = 0 + self.keep_ratio = False + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + if 'keep_ratio' in kwargs: + self.keep_ratio = kwargs['keep_ratio'] + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if sum([src_h, src_w]) < 64: + img = self.image_padding(img) + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def image_padding(self, im, value=0): + h, w, c = im.shape + im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value + im_pad[:h, :w, :] = im + return im_pad + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + if self.keep_ratio is True: + resize_w = ori_w * resize_h / ori_h + N = math.ceil(resize_w / 32) + resize_w = N * 32 + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), 
int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, c = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'min': + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + elif self.limit_type == 'resize_long': + ratio = float(limit_side_len) / max(h, w) + else: + raise Exception('not support limit type, image ') + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = max(int(round(resize_h / 32) * 32), 32) + resize_w = max(int(round(resize_w / 32) * 32), 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] + + +class E2EResizeForTest(object): + def __init__(self, **kwargs): + super(E2EResizeForTest, self).__init__() + self.max_side_len = kwargs['max_side_len'] + self.valid_set = kwargs['valid_set'] + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + if self.valid_set == 'totaltext': + im_resized, [ratio_h, ratio_w] = self.resize_image_for_totaltext( + img, max_side_len=self.max_side_len) + else: + im_resized, (ratio_h, ratio_w) = self.resize_image( + img, max_side_len=self.max_side_len) + data['image'] = im_resized + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_for_totaltext(self, im, max_side_len=512): + + h, w, _ = im.shape + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + def resize_image(self, im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if 
resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +class KieResize(object): + def __init__(self, **kwargs): + super(KieResize, self).__init__() + self.max_side, self.min_side = kwargs['img_scale'][0], kwargs[ + 'img_scale'][1] + + def __call__(self, data): + img = data['image'] + points = data['points'] + src_h, src_w, _ = img.shape + im_resized, scale_factor, [ratio_h, ratio_w + ], [new_h, new_w] = self.resize_image(img) + resize_points = self.resize_boxes(img, points, scale_factor) + data['ori_image'] = img + data['ori_boxes'] = points + data['points'] = resize_points + data['image'] = im_resized + data['shape'] = np.array([new_h, new_w]) + return data + + def resize_image(self, img): + norm_img = np.zeros([1024, 1024, 3], dtype='float32') + scale = [512, 1024] + h, w = img.shape[:2] + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + resize_w, resize_h = int(w * float(scale_factor) + 0.5), int(h * float( + scale_factor) + 0.5) + max_stride = 32 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(img, (resize_w, resize_h)) + new_h, new_w = im.shape[:2] + w_scale = new_w / w + h_scale = new_h / h + scale_factor = np.array( + [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) + norm_img[:new_h, :new_w, :] = im + return norm_img, scale_factor, [h_scale, w_scale], [new_h, new_w] + + def resize_boxes(self, im, points, scale_factor): + points = points * scale_factor + img_shape = im.shape[:2] + points[:, 0::2] = np.clip(points[:, 0::2], 0, img_shape[1]) + points[:, 1::2] = np.clip(points[:, 1::2], 0, img_shape[0]) + return points + + +class SRResize(object): + def __init__(self, + imgH=32, + imgW=128, + down_sample_scale=4, + keep_ratio=False, + min_ratio=1, + mask=False, + infer_mode=False, + **kwargs): + self.imgH = imgH + self.imgW = imgW + self.keep_ratio = keep_ratio + self.min_ratio = min_ratio + self.down_sample_scale = down_sample_scale + self.mask = mask + self.infer_mode = infer_mode + + def __call__(self, data): + imgH = self.imgH + imgW = self.imgW + images_lr = data["image_lr"] + transform2 = ResizeNormalize( + (imgW // self.down_sample_scale, imgH // self.down_sample_scale)) + images_lr = transform2(images_lr) + data["img_lr"] = images_lr + if self.infer_mode: + return data + + images_HR = data["image_hr"] + label_strs = data["label"] + transform = ResizeNormalize((imgW, imgH)) + images_HR = transform(images_HR) + data["img_hr"] = images_HR + return data + + +class ResizeNormalize(object): + def __init__(self, size, interpolation=Image.BICUBIC): + self.size = size + self.interpolation = interpolation + + def __call__(self, img): + img = img.resize(self.size, self.interpolation) + img_numpy = np.array(img).astype("float32") + img_numpy = img_numpy.transpose((2, 0, 1)) / 255 + return img_numpy + + +class GrayImageChannelFormat(object): + """ + format gray scale image's channel: (3,h,w) -> (1,h,w) + Args: + inverse: inverse gray image + """ + + def 
__init__(self, inverse=False, **kwargs): + self.inverse = inverse + + def __call__(self, data): + img = data['image'] + img_single_channel = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + img_expanded = np.expand_dims(img_single_channel, 0) + + if self.inverse: + data['image'] = np.abs(img_expanded - 1) + else: + data['image'] = img_expanded + + data['src_image'] = img + return data \ No newline at end of file diff --git a/ppocr/data/imaug/pg_process.py b/ppocr/data/imaug/pg_process.py new file mode 100644 index 0000000000000000000000000000000000000000..f1e5f912b7a55dc3b9e883a9f4f8c5de482dcd5a --- /dev/null +++ b/ppocr/data/imaug/pg_process.py @@ -0,0 +1,1034 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import cv2 +import numpy as np +from skimage.morphology._skeletonize import thin +from ppocr.utils.e2e_utils.extract_textpoint_fast import sort_and_expand_with_direction_v2 + +__all__ = ['PGProcessTrain'] + + +class PGProcessTrain(object): + def __init__(self, + character_dict_path, + max_text_length, + max_text_nums, + tcl_len, + batch_size=14, + use_resize=True, + use_random_crop=False, + min_crop_size=24, + min_text_size=4, + max_text_size=512, + point_gather_mode=None, + **kwargs): + self.tcl_len = tcl_len + self.max_text_length = max_text_length + self.max_text_nums = max_text_nums + self.batch_size = batch_size + if use_random_crop is True: + self.min_crop_size = min_crop_size + self.use_random_crop = use_random_crop + self.min_text_size = min_text_size + self.max_text_size = max_text_size + self.use_resize = use_resize + self.point_gather_mode = point_gather_mode + self.Lexicon_Table = self.get_dict(character_dict_path) + self.pad_num = len(self.Lexicon_Table) + self.img_id = 0 + + def get_dict(self, character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + def quad_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. + + def gen_quad_from_poly(self, poly): + """ + Generate min area quad from poly. 
+ """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad + + def check_and_validate_polys(self, polys, tags, im_size): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + (h, w) = im_size + if polys.shape[0] == 0: + return polys, np.array([]), np.array([]) + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + hv_tags = [] + for poly, tag in zip(polys, tags): + quad = self.gen_quad_from_poly(poly) + p_area = self.quad_area(quad) + if abs(p_area) < 1: + print('invalid poly') + continue + if p_area > 0: + if tag == False: + print('poly in wrong direction') + tag = True # reversed cases should be ignore + poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1), :] + quad = quad[(0, 3, 2, 1), :] + + len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] - + quad[2]) + len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - + quad[2]) + hv_tag = 1 + + if len_w * 2.0 < len_h: + hv_tag = 0 + + validated_polys.append(poly) + validated_tags.append(tag) + hv_tags.append(hv_tag) + return np.array(validated_polys), np.array(validated_tags), np.array( + hv_tags) + + def crop_area(self, + im, + polys, + tags, + hv_tags, + txts, + crop_background=False, + max_tries=25): + """ + make random crop from the input image + :param im: + :param polys: [b,4,2] + :param tags: + :param crop_background: + :param max_tries: 50 -> 25 + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags, hv_tags, txts + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + if xmax - xmin < self.min_crop_size or \ + ymax - ymin < self.min_crop_size: + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ + & (polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + if 
len(selected_polys) == 0: + # no text in this area + if crop_background: + txts_tmp = [] + for selected_poly in selected_polys: + txts_tmp.append(txts[selected_poly]) + txts = txts_tmp + return im[ymin: ymax + 1, xmin: xmax + 1, :], \ + polys[selected_polys], tags[selected_polys], hv_tags[selected_polys], txts + else: + continue + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + hv_tags = hv_tags[selected_polys] + txts_tmp = [] + for selected_poly in selected_polys: + txts_tmp.append(txts[selected_poly]) + txts = txts_tmp + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags, hv_tags, txts + + return im, polys, tags, hv_tags, txts + + def fit_and_gather_tcl_points_v2(self, + min_area_quad, + poly, + max_h, + max_w, + fixed_point_num=64, + img_id=0, + reference_height=3): + """ + Find the center point of poly as key_points, then fit and gather. + """ + key_point_xys = [] + point_num = poly.shape[0] + for idx in range(point_num // 2): + center_point = (poly[idx] + poly[point_num - 1 - idx]) / 2.0 + key_point_xys.append(center_point) + + tmp_image = np.zeros( + shape=( + max_h, + max_w, ), dtype='float32') + cv2.polylines(tmp_image, [np.array(key_point_xys).astype('int32')], + False, 1.0) + ys, xs = np.where(tmp_image > 0) + xy_text = np.array(list(zip(xs, ys)), dtype='float32') + + left_center_pt = ( + (min_area_quad[0] - min_area_quad[1]) / 2.0).reshape(1, 2) + right_center_pt = ( + (min_area_quad[1] - min_area_quad[2]) / 2.0).reshape(1, 2) + proj_unit_vec = (right_center_pt - left_center_pt) / ( + np.linalg.norm(right_center_pt - left_center_pt) + 1e-6) + proj_unit_vec_tile = np.tile(proj_unit_vec, + (xy_text.shape[0], 1)) # (n, 2) + left_center_pt_tile = np.tile(left_center_pt, + (xy_text.shape[0], 1)) # (n, 2) + xy_text_to_left_center = xy_text - left_center_pt_tile + proj_value = np.sum(xy_text_to_left_center * proj_unit_vec_tile, axis=1) + xy_text = xy_text[np.argsort(proj_value)] + + # convert to np and keep the num of point not greater then fixed_point_num + pos_info = np.array(xy_text).reshape(-1, 2)[:, ::-1] # xy-> yx + point_num = len(pos_info) + if point_num > fixed_point_num: + keep_ids = [ + int((point_num * 1.0 / fixed_point_num) * x) + for x in range(fixed_point_num) + ] + pos_info = pos_info[keep_ids, :] + + keep = int(min(len(pos_info), fixed_point_num)) + if np.random.rand() < 0.2 and reference_height >= 3: + dl = (np.random.rand(keep) - 0.5) * reference_height * 0.3 + random_float = np.array([1, 0]).reshape([1, 2]) * dl.reshape( + [keep, 1]) + pos_info += random_float + pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1) + pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1) + + # padding to fixed length + pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32) + pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id + pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32) + pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32) + pos_m[:keep] = 1.0 + return pos_l, pos_m + + def fit_and_gather_tcl_points_v3(self, + min_area_quad, + poly, + max_h, + max_w, + fixed_point_num=64, + img_id=0, + reference_height=3): + """ + Find the center point of poly as key_points, then fit and gather. 
+ """ + det_mask = np.zeros((int(max_h / self.ds_ratio), + int(max_w / self.ds_ratio))).astype(np.float32) + + # score_big_map + cv2.fillPoly(det_mask, + np.round(poly / self.ds_ratio).astype(np.int32), 1.0) + det_mask = cv2.resize( + det_mask, dsize=None, fx=self.ds_ratio, fy=self.ds_ratio) + det_mask = np.array(det_mask > 1e-3, dtype='float32') + + f_direction = self.f_direction + skeleton_map = thin(det_mask.astype(np.uint8)) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + ys, xs = np.where(instance_label_map == 1) + pos_list = list(zip(ys, xs)) + if len(pos_list) < 3: + return None + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, det_mask) + + pos_list_sorted = np.array(pos_list_sorted) + length = len(pos_list_sorted) - 1 + insert_num = 0 + for index in range(length): + stride_y = np.abs(pos_list_sorted[index + insert_num][0] - + pos_list_sorted[index + 1 + insert_num][0]) + stride_x = np.abs(pos_list_sorted[index + insert_num][1] - + pos_list_sorted[index + 1 + insert_num][1]) + max_points = int(max(stride_x, stride_y)) + + stride = (pos_list_sorted[index + insert_num] - + pos_list_sorted[index + 1 + insert_num]) / (max_points) + insert_num_temp = max_points - 1 + + for i in range(int(insert_num_temp)): + insert_value = pos_list_sorted[index + insert_num] - (i + 1 + ) * stride + insert_index = index + i + 1 + insert_num + pos_list_sorted = np.insert( + pos_list_sorted, insert_index, insert_value, axis=0) + insert_num += insert_num_temp + + pos_info = np.array(pos_list_sorted).reshape(-1, 2).astype( + np.float32) # xy-> yx + + point_num = len(pos_info) + if point_num > fixed_point_num: + keep_ids = [ + int((point_num * 1.0 / fixed_point_num) * x) + for x in range(fixed_point_num) + ] + pos_info = pos_info[keep_ids, :] + + keep = int(min(len(pos_info), fixed_point_num)) + reference_width = (np.abs(poly[0, 0, 0] - poly[-1, 1, 0]) + + np.abs(poly[0, 3, 0] - poly[-1, 2, 0])) // 2 + if np.random.rand() < 1: + dh = (np.random.rand(keep) - 0.5) * reference_height + offset = np.random.rand() - 0.5 + dw = np.array([[0, offset * reference_width * 0.2]]) + random_float_h = np.array([1, 0]).reshape([1, 2]) * dh.reshape( + [keep, 1]) + random_float_w = dw.repeat(keep, axis=0) + pos_info += random_float_h + pos_info += random_float_w + pos_info[:, 0] = np.clip(pos_info[:, 0], 0, max_h - 1) + pos_info[:, 1] = np.clip(pos_info[:, 1], 0, max_w - 1) + + # padding to fixed length + pos_l = np.zeros((self.tcl_len, 3), dtype=np.int32) + pos_l[:, 0] = np.ones((self.tcl_len, )) * img_id + pos_m = np.zeros((self.tcl_len, 1), dtype=np.float32) + pos_l[:keep, 1:] = np.round(pos_info).astype(np.int32) + pos_m[:keep] = 1.0 + return pos_l, pos_m + + def generate_direction_map(self, poly_quads, n_char, direction_map): + """ + """ + width_list = [] + height_list = [] + for quad in poly_quads: + quad_w = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + width_list.append(quad_w) + height_list.append(quad_h) + norm_width = max(sum(width_list) / n_char, 1.0) + average_height = max(sum(height_list) / len(height_list), 1.0) + k = 1 + for quad in poly_quads: + direct_vector_full = ( + (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0 + direct_vector = direct_vector_full / ( + np.linalg.norm(direct_vector_full) + 1e-6) * norm_width + direction_label = tuple( + map(float, + [direct_vector[0], 
direct_vector[1], 1.0 / average_height])) + cv2.fillPoly(direction_map, + quad.round().astype(np.int32)[np.newaxis, :, :], + direction_label) + k += 1 + return direction_map + + def calculate_average_height(self, poly_quads): + """ + """ + height_list = [] + for quad in poly_quads: + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + height_list.append(quad_h) + average_height = max(sum(height_list) / len(height_list), 1.0) + return average_height + + def generate_tcl_ctc_label(self, + h, + w, + polys, + tags, + text_strs, + ds_ratio, + tcl_ratio=0.3, + shrink_ratio_of_width=0.15): + """ + Generate polygon. + """ + self.ds_ratio = ds_ratio + score_map_big = np.zeros( + ( + h, + w, ), dtype=np.float32) + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + + score_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + score_label_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + tbo_map = np.zeros((h, w, 5), dtype=np.float32) + training_mask = np.ones( + ( + h, + w, ), dtype=np.float32) + direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape( + [1, 1, 3]).astype(np.float32) + + label_idx = 0 + score_label_map_text_label_list = [] + pos_list, pos_mask, label_list = [], [], [] + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \ + or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio: + continue + + if tag: + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0.15) + else: + text_label = text_strs[poly_idx] + text_label = self.prepare_text_label(text_label, + self.Lexicon_Table) + text_label_index_list = [[self.Lexicon_Table.index(c_)] + for c_ in text_label + if c_ in self.Lexicon_Table] + if len(text_label_index_list) < 1: + continue + + tcl_poly = self.poly2tcl(poly, tcl_ratio) + tcl_quads = self.poly2quads(tcl_poly) + poly_quads = self.poly2quads(poly) + + stcl_quads, quad_index = self.shrink_poly_along_width( + tcl_quads, + shrink_ratio_of_width=shrink_ratio_of_width, + expand_height_ratio=1.0 / tcl_ratio) + + cv2.fillPoly(score_map, + np.round(stcl_quads).astype(np.int32), 1.0) + cv2.fillPoly(score_map_big, + np.round(stcl_quads / ds_ratio).astype(np.int32), + 1.0) + + for idx, quad in enumerate(stcl_quads): + quad_mask = np.zeros((h, w), dtype=np.float32) + quad_mask = cv2.fillPoly( + quad_mask, + np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0) + tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]], + quad_mask, tbo_map) + + # score label map and score_label_map_text_label_list for refine + if label_idx == 0: + text_pos_list_ = [[len(self.Lexicon_Table)], ] + score_label_map_text_label_list.append(text_pos_list_) + + label_idx += 1 + cv2.fillPoly(score_label_map, + np.round(poly_quads).astype(np.int32), label_idx) + score_label_map_text_label_list.append(text_label_index_list) + + # direction info, fix-me + n_char = len(text_label_index_list) + direction_map = self.generate_direction_map(poly_quads, n_char, + direction_map) + + # pos info + average_shrink_height = 
self.calculate_average_height( + stcl_quads) + + if self.point_gather_mode == 'align': + self.f_direction = direction_map[:, :, :-1].copy() + pos_res = self.fit_and_gather_tcl_points_v3( + min_area_quad, + stcl_quads, + max_h=h, + max_w=w, + fixed_point_num=64, + img_id=self.img_id, + reference_height=average_shrink_height) + if pos_res is None: + continue + pos_l, pos_m = pos_res[0], pos_res[1] + + else: + pos_l, pos_m = self.fit_and_gather_tcl_points_v2( + min_area_quad, + poly, + max_h=h, + max_w=w, + fixed_point_num=64, + img_id=self.img_id, + reference_height=average_shrink_height) + + label_l = text_label_index_list + if len(text_label_index_list) < 2: + continue + + pos_list.append(pos_l) + pos_mask.append(pos_m) + label_list.append(label_l) + + # use big score_map for smooth tcl lines + score_map_big_resized = cv2.resize( + score_map_big, dsize=None, fx=ds_ratio, fy=ds_ratio) + score_map = np.array(score_map_big_resized > 1e-3, dtype='float32') + + return score_map, score_label_map, tbo_map, direction_map, training_mask, \ + pos_list, pos_mask, label_list, score_label_map_text_label_list + + def adjust_point(self, poly): + """ + adjust point order. + """ + point_num = poly.shape[0] + if point_num == 4: + len_1 = np.linalg.norm(poly[0] - poly[1]) + len_2 = np.linalg.norm(poly[1] - poly[2]) + len_3 = np.linalg.norm(poly[2] - poly[3]) + len_4 = np.linalg.norm(poly[3] - poly[0]) + + if (len_1 + len_3) * 1.5 < (len_2 + len_4): + poly = poly[[1, 2, 3, 0], :] + + elif point_num > 4: + vector_1 = poly[0] - poly[1] + vector_2 = poly[1] - poly[2] + cos_theta = np.dot(vector_1, vector_2) / ( + np.linalg.norm(vector_1) * np.linalg.norm(vector_2) + 1e-6) + theta = np.arccos(np.round(cos_theta, decimals=4)) + + if abs(theta) > (70 / 180 * math.pi): + index = list(range(1, point_num)) + [0] + poly = poly[np.array(index), :] + return poly + + def gen_min_area_quad_from_poly(self, poly): + """ + Generate min area quad from poly. + """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if point_num == 4: + min_area_quad = poly + center_point = np.sum(poly, axis=0) / 4 + else: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad, center_point + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def shrink_poly_along_width(self, + quads, + shrink_ratio_of_width, + expand_height_ratio=1.0): + """ + shrink poly with given length. 
+ """ + upper_edge_list = [] + + def get_cut_info(edge_len_list, cut_len): + for idx, edge_len in enumerate(edge_len_list): + cut_len -= edge_len + if cut_len <= 0.000001: + ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx] + return idx, ratio + + for quad in quads: + upper_edge_len = np.linalg.norm(quad[0] - quad[1]) + upper_edge_list.append(upper_edge_len) + + # length of left edge and right edge. + left_length = np.linalg.norm(quads[0][0] - quads[0][ + 3]) * expand_height_ratio + right_length = np.linalg.norm(quads[-1][1] - quads[-1][ + 2]) * expand_height_ratio + + shrink_length = min(left_length, right_length, + sum(upper_edge_list)) * shrink_ratio_of_width + # shrinking length + upper_len_left = shrink_length + upper_len_right = sum(upper_edge_list) - shrink_length + + left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left) + left_quad = self.shrink_quad_along_width( + quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1) + right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right) + right_quad = self.shrink_quad_along_width( + quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio) + + out_quad_list = [] + if left_idx == right_idx: + out_quad_list.append( + [left_quad[0], right_quad[1], right_quad[2], left_quad[3]]) + else: + out_quad_list.append(left_quad) + for idx in range(left_idx + 1, right_idx): + out_quad_list.append(quads[idx]) + out_quad_list.append(right_quad) + + return np.array(out_quad_list), list(range(left_idx, right_idx + 1)) + + def prepare_text_label(self, label_str, Lexicon_Table): + """ + Prepare text lablel by given Lexicon_Table. + """ + if len(Lexicon_Table) == 36: + return label_str.lower() + else: + return label_str + + def vector_angle(self, A, B): + """ + Calculate the angle between vector AB and x-axis positive direction. + """ + AB = np.array([B[1] - A[1], B[0] - A[0]]) + return np.arctan2(*AB) + + def theta_line_cross_point(self, theta, point): + """ + Calculate the line through given point and angle in ax + by + c =0 form. + """ + x, y = point + cos = np.cos(theta) + sin = np.sin(theta) + return [sin, -cos, cos * y - sin * x] + + def line_cross_two_point(self, A, B): + """ + Calculate the line through given point A and B in ax + by + c =0 form. + """ + angle = self.vector_angle(A, B) + return self.theta_line_cross_point(angle, A) + + def average_angle(self, poly): + """ + Calculate the average angle between left and right edge in given poly. + """ + p0, p1, p2, p3 = poly + angle30 = self.vector_angle(p3, p0) + angle21 = self.vector_angle(p2, p1) + return (angle30 + angle21) / 2 + + def line_cross_point(self, line1, line2): + """ + line1 and line2 in 0=ax+by+c form, compute the cross point of line1 and line2 + """ + a1, b1, c1 = line1 + a2, b2, c2 = line2 + d = a1 * b2 - a2 * b1 + + if d == 0: + print('Cross point does not exist') + return np.array([0, 0], dtype=np.float32) + else: + x = (b1 * c2 - b2 * c1) / d + y = (a2 * c1 - a1 * c2) / d + + return np.array([x, y], dtype=np.float32) + + def quad2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. (4, 2) + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + p0_3 = poly[0] + (poly[3] - poly[0]) * ratio_pair + p1_2 = poly[1] + (poly[2] - poly[1]) * ratio_pair + return np.array([p0_3[0], p1_2[0], p1_2[1], p0_3[1]]) + + def poly2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. 
+ """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + tcl_poly = np.zeros_like(poly) + point_num = poly.shape[0] + + for idx in range(point_num // 2): + point_pair = poly[idx] + (poly[point_num - 1 - idx] - poly[idx] + ) * ratio_pair + tcl_poly[idx] = point_pair[0] + tcl_poly[point_num - 1 - idx] = point_pair[1] + return tcl_poly + + def gen_quad_tbo(self, quad, tcl_mask, tbo_map): + """ + Generate tbo_map for give quad. + """ + # upper and lower line function: ax + by + c = 0; + up_line = self.line_cross_two_point(quad[0], quad[1]) + lower_line = self.line_cross_two_point(quad[3], quad[2]) + + quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) + quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) + + # average angle of left and right line. + angle = self.average_angle(quad) + + xy_in_poly = np.argwhere(tcl_mask == 1) + for y, x in xy_in_poly: + point = (x, y) + line = self.theta_line_cross_point(angle, point) + cross_point_upper = self.line_cross_point(up_line, line) + cross_point_lower = self.line_cross_point(lower_line, line) + ##FIX, offset reverse + upper_offset_x, upper_offset_y = cross_point_upper - point + lower_offset_x, lower_offset_y = cross_point_lower - point + tbo_map[y, x, 0] = upper_offset_y + tbo_map[y, x, 1] = upper_offset_x + tbo_map[y, x, 2] = lower_offset_y + tbo_map[y, x, 3] = lower_offset_x + tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2 + return tbo_map + + def poly2quads(self, poly): + """ + Split poly into quads. + """ + quad_list = [] + point_num = poly.shape[0] + + # point pair + point_pair_list = [] + for idx in range(point_num // 2): + point_pair = [poly[idx], poly[point_num - 1 - idx]] + point_pair_list.append(point_pair) + + quad_num = point_num // 2 - 1 + for idx in range(quad_num): + # reshape and adjust to clock-wise + quad_list.append((np.array(point_pair_list)[[idx, idx + 1]] + ).reshape(4, 2)[[0, 2, 3, 1]]) + + return np.array(quad_list) + + def rotate_im_poly(self, im, text_polys): + """ + rotate image with 90 / 180 / 270 degre + """ + im_w, im_h = im.shape[1], im.shape[0] + dst_im = im.copy() + dst_polys = [] + rand_degree_ratio = np.random.rand() + rand_degree_cnt = 1 + if rand_degree_ratio > 0.5: + rand_degree_cnt = 3 + for i in range(rand_degree_cnt): + dst_im = np.rot90(dst_im) + rot_degree = -90 * rand_degree_cnt + rot_angle = rot_degree * math.pi / 180.0 + n_poly = text_polys.shape[0] + cx, cy = 0.5 * im_w, 0.5 * im_h + ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0] + for i in range(n_poly): + wordBB = text_polys[i] + poly = [] + for j in range(4): # 16->4 + sx, sy = wordBB[j][0], wordBB[j][1] + dx = math.cos(rot_angle) * (sx - cx) - math.sin(rot_angle) * ( + sy - cy) + ncx + dy = math.sin(rot_angle) * (sx - cx) + math.cos(rot_angle) * ( + sy - cy) + ncy + poly.append([dx, dy]) + dst_polys.append(poly) + return dst_im, np.array(dst_polys, dtype=np.float32) + + def __call__(self, data): + input_size = 512 + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + text_strs = data['texts'] + h, w, _ = im.shape + text_polys, text_tags, hv_tags = self.check_and_validate_polys( + text_polys, text_tags, (h, w)) + if text_polys.shape[0] <= 0: + return None + # set aspect ratio and keep area fix + asp_scales = np.arange(1.0, 1.55, 0.1) + asp_scale = np.random.choice(asp_scales) + if np.random.rand() < 0.5: + asp_scale = 1.0 / asp_scale + asp_scale = math.sqrt(asp_scale) + + asp_wx = asp_scale 
+ asp_hy = 1.0 / asp_scale + im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy) + text_polys[:, :, 0] *= asp_wx + text_polys[:, :, 1] *= asp_hy + + if self.use_resize is True: + ori_h, ori_w, _ = im.shape + if max(ori_h, ori_w) < 200: + ratio = 200 / max(ori_h, ori_w) + im = cv2.resize(im, (int(ori_w * ratio), int(ori_h * ratio))) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + + if max(ori_h, ori_w) > 512: + ratio = 512 / max(ori_h, ori_w) + im = cv2.resize(im, (int(ori_w * ratio), int(ori_h * ratio))) + text_polys[:, :, 0] *= ratio + text_polys[:, :, 1] *= ratio + elif self.use_random_crop is True: + h, w, _ = im.shape + if max(h, w) > 2048: + rd_scale = 2048.0 / max(h, w) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + h, w, _ = im.shape + if min(h, w) < 16: + return None + + # no background + im, text_polys, text_tags, hv_tags, text_strs = self.crop_area( + im, + text_polys, + text_tags, + hv_tags, + text_strs, + crop_background=False) + + if text_polys.shape[0] == 0: + return None + # continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + new_h, new_w, _ = im.shape + if (new_h is None) or (new_w is None): + return None + # resize image + std_ratio = float(input_size) / max(new_w, new_h) + rand_scales = np.array( + [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0]) + rz_scale = std_ratio * np.random.choice(rand_scales) + im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale) + text_polys[:, :, 0] *= rz_scale + text_polys[:, :, 1] *= rz_scale + + # add gaussian blur + if np.random.rand() < 0.1 * 0.5: + ks = np.random.permutation(5)[0] + 1 + ks = int(ks / 2) * 2 + 1 + im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0) + # add brighter + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 + np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + # add darker + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 - np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + + # Padding the im to [input_size, input_size] + new_h, new_w, _ = im.shape + if min(new_w, new_h) < input_size * 0.5: + return None + im_padded = np.ones((input_size, input_size, 3), dtype=np.float32) + im_padded[:, :, 2] = 0.485 * 255 + im_padded[:, :, 1] = 0.456 * 255 + im_padded[:, :, 0] = 0.406 * 255 + + # Random the start position + del_h = input_size - new_h + del_w = input_size - new_w + sh, sw = 0, 0 + if del_h > 1: + sh = int(np.random.rand() * del_h) + if del_w > 1: + sw = int(np.random.rand() * del_w) + + # Padding + im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy() + text_polys[:, :, 0] += sw + text_polys[:, :, 1] += sh + + score_map, score_label_map, border_map, direction_map, training_mask, \ + pos_list, pos_mask, label_list, score_label_map_text_label = self.generate_tcl_ctc_label(input_size, + input_size, + text_polys, + text_tags, + text_strs, 0.25) + if len(label_list) <= 0: # eliminate negative samples + return None + pos_list_temp = np.zeros([64, 3]) + pos_mask_temp = np.zeros([64, 1]) + label_list_temp = np.zeros([self.max_text_length, 1]) + self.pad_num + + for i, label in enumerate(label_list): + n = len(label) + if n > self.max_text_length: + label_list[i] = label[:self.max_text_length] + continue + while n < self.max_text_length: + label.append([self.pad_num]) + n += 1 + + for i in range(len(label_list)): + label_list[i] = np.array(label_list[i]) + + if len(pos_list) <= 0 or len(pos_list) > self.max_text_nums: + return None + for __ in range(self.max_text_nums - 
len(pos_list), 0, -1): + pos_list.append(pos_list_temp) + pos_mask.append(pos_mask_temp) + label_list.append(label_list_temp) + + if self.img_id == self.batch_size - 1: + self.img_id = 0 + else: + self.img_id += 1 + + im_padded[:, :, 2] -= 0.485 * 255 + im_padded[:, :, 1] -= 0.456 * 255 + im_padded[:, :, 0] -= 0.406 * 255 + im_padded[:, :, 2] /= (255.0 * 0.229) + im_padded[:, :, 1] /= (255.0 * 0.224) + im_padded[:, :, 0] /= (255.0 * 0.225) + im_padded = im_padded.transpose((2, 0, 1)) + images = im_padded[::-1, :, :] + tcl_maps = score_map[np.newaxis, :, :] + tcl_label_maps = score_label_map[np.newaxis, :, :] + border_maps = border_map.transpose((2, 0, 1)) + direction_maps = direction_map.transpose((2, 0, 1)) + training_masks = training_mask[np.newaxis, :, :] + pos_list = np.array(pos_list) + pos_mask = np.array(pos_mask) + label_list = np.array(label_list) + data['images'] = images + data['tcl_maps'] = tcl_maps + data['tcl_label_maps'] = tcl_label_maps + data['border_maps'] = border_maps + data['direction_maps'] = direction_maps + data['training_masks'] = training_masks + data['label_list'] = label_list + data['pos_list'] = pos_list + data['pos_mask'] = pos_mask + return data diff --git a/ppocr/data/imaug/randaugment.py b/ppocr/data/imaug/randaugment.py new file mode 100644 index 0000000000000000000000000000000000000000..56f114d2f665f9b326e96819ac3d606c87a6e142 --- /dev/null +++ b/ppocr/data/imaug/randaugment.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
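# A quick usage sketch of the RandAugment wrapper defined below. The import
# path is an assumption based on this file's location and presumes the repo
# root is on PYTHONPATH with Pillow/six installed. magnitude / max_level
# rescales every op's range, e.g. magnitude=5 with max_level=10 draws
# rotations from roughly +/-15 degrees and shearX/shearY from +/-0.15, and
# each call applies num_layers randomly chosen ops with probability prob.
import numpy as np
from ppocr.data.imaug.randaugment import RandAugment

aug = RandAugment(prob=0.5, num_layers=2, magnitude=5)
sample = {'image': np.random.randint(0, 255, (48, 160, 3), dtype=np.uint8)}
sample = aug(sample)          # no-op with probability 1 - prob
print(sample['image'].shape)  # stays an (H, W, 3) uint8 array either way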
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random +import six + + +class RawRandAugment(object): + def __init__(self, + num_layers=2, + magnitude=5, + fillcolor=(128, 128, 128), + **kwargs): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, magnitude: + ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + def __call__(self, img): + avaiable_op_names = list(self.level_map.keys()) + for layer_num in range(self.num_layers): + op_name = np.random.choice(avaiable_op_names) + img = self.func[op_name](img, self.level_map[op_name]) + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, prob=0.5, *args, **kwargs): + self.prob = prob + if six.PY2: + super(RandAugment, self).__init__(*args, **kwargs) + else: + super().__init__(*args, **kwargs) + + def __call__(self, data): + if np.random.rand() > self.prob: + return data + img = data['image'] + if not isinstance(img, Image.Image): + img = 
np.ascontiguousarray(img) + img = Image.fromarray(img) + + if six.PY2: + img = super(RandAugment, self).__call__(img) + else: + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + data['image'] = img + return data diff --git a/ppocr/data/imaug/random_crop_data.py b/ppocr/data/imaug/random_crop_data.py new file mode 100644 index 0000000000000000000000000000000000000000..64aa110de4e3df950ce21e6d657877081b0fdd13 --- /dev/null +++ b/ppocr/data/imaug/random_crop_data.py @@ -0,0 +1,234 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/data_loader/modules/random_crop_data.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import cv2 +import random + + +def is_poly_in_rect(poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].min() < x or poly[:, 0].max() > x + w: + return False + if poly[:, 1].min() < y or poly[:, 1].max() > y + h: + return False + return True + + +def is_poly_outside_rect(poly, x, y, w, h): + poly = np.array(poly) + if poly[:, 0].max() < x or poly[:, 0].min() > x + w: + return True + if poly[:, 1].max() < y or poly[:, 1].min() > y + h: + return True + return False + + +def split_regions(axis): + regions = [] + min_axis = 0 + for i in range(1, axis.shape[0]): + if axis[i] != axis[i - 1] + 1: + region = axis[min_axis:i] + min_axis = i + regions.append(region) + return regions + + +def random_select(axis, max_size): + xx = np.random.choice(axis, size=2) + xmin = np.min(xx) + xmax = np.max(xx) + xmin = np.clip(xmin, 0, max_size - 1) + xmax = np.clip(xmax, 0, max_size - 1) + return xmin, xmax + + +def region_wise_random_select(regions, max_size): + selected_index = list(np.random.choice(len(regions), 2)) + selected_values = [] + for index in selected_index: + axis = regions[index] + xx = int(np.random.choice(axis, size=1)) + selected_values.append(xx) + xmin = min(selected_values) + xmax = max(selected_values) + return xmin, xmax + + +def crop_area(im, text_polys, min_crop_side_ratio, max_tries): + h, w, _ = im.shape + h_array = np.zeros(h, dtype=np.int32) + w_array = np.zeros(w, dtype=np.int32) + for points in text_polys: + points = np.round(points, decimals=0).astype(np.int32) + minx = np.min(points[:, 0]) + maxx = np.max(points[:, 0]) + w_array[minx:maxx] = 1 + miny = np.min(points[:, 1]) + maxy = np.max(points[:, 1]) + h_array[miny:maxy] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + + if len(h_axis) == 0 or len(w_axis) == 0: + return 0, 0, w, h + + h_regions = split_regions(h_axis) + w_regions = split_regions(w_axis) + + for i in range(max_tries): + if len(w_regions) > 1: + xmin, xmax = region_wise_random_select(w_regions, w) + else: + xmin, xmax = random_select(w_axis, w) + if 
len(h_regions) > 1: + ymin, ymax = region_wise_random_select(h_regions, h) + else: + ymin, ymax = random_select(h_axis, h) + + if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h: + # area too small + continue + num_poly_in_rect = 0 + for poly in text_polys: + if not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin, + ymax - ymin): + num_poly_in_rect += 1 + break + + if num_poly_in_rect > 0: + return xmin, ymin, xmax - xmin, ymax - ymin + + return 0, 0, w, h + + +class EastRandomCropData(object): + def __init__(self, + size=(640, 640), + max_tries=10, + min_crop_side_ratio=0.1, + keep_ratio=True, + **kwargs): + self.size = size + self.max_tries = max_tries + self.min_crop_side_ratio = min_crop_side_ratio + self.keep_ratio = keep_ratio + + def __call__(self, data): + img = data['image'] + text_polys = data['polys'] + ignore_tags = data['ignore_tags'] + texts = data['texts'] + all_care_polys = [ + text_polys[i] for i, tag in enumerate(ignore_tags) if not tag + ] + # 计算crop区域 + crop_x, crop_y, crop_w, crop_h = crop_area( + img, all_care_polys, self.min_crop_side_ratio, self.max_tries) + # crop 图片 保持比例填充 + scale_w = self.size[0] / crop_w + scale_h = self.size[1] / crop_h + scale = min(scale_w, scale_h) + h = int(crop_h * scale) + w = int(crop_w * scale) + if self.keep_ratio: + padimg = np.zeros((self.size[1], self.size[0], img.shape[2]), + img.dtype) + padimg[:h, :w] = cv2.resize( + img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], (w, h)) + img = padimg + else: + img = cv2.resize( + img[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w], + tuple(self.size)) + # crop 文本框 + text_polys_crop = [] + ignore_tags_crop = [] + texts_crop = [] + for poly, text, tag in zip(text_polys, texts, ignore_tags): + poly = ((poly - (crop_x, crop_y)) * scale).tolist() + if not is_poly_outside_rect(poly, 0, 0, w, h): + text_polys_crop.append(poly) + ignore_tags_crop.append(tag) + texts_crop.append(text) + data['image'] = img + data['polys'] = np.array(text_polys_crop) + data['ignore_tags'] = ignore_tags_crop + data['texts'] = texts_crop + return data + + +class RandomCropImgMask(object): + def __init__(self, size, main_key, crop_keys, p=3 / 8, **kwargs): + self.size = size + self.main_key = main_key + self.crop_keys = crop_keys + self.p = p + + def __call__(self, data): + image = data['image'] + + h, w = image.shape[0:2] + th, tw = self.size + if w == tw and h == th: + return data + + mask = data[self.main_key] + if np.max(mask) > 0 and random.random() > self.p: + # make sure to crop the text region + tl = np.min(np.where(mask > 0), axis=1) - (th, tw) + tl[tl < 0] = 0 + br = np.max(np.where(mask > 0), axis=1) - (th, tw) + br[br < 0] = 0 + + br[0] = min(br[0], h - th) + br[1] = min(br[1], w - tw) + + i = random.randint(tl[0], br[0]) if tl[0] < br[0] else 0 + j = random.randint(tl[1], br[1]) if tl[1] < br[1] else 0 + else: + i = random.randint(0, h - th) if h - th > 0 else 0 + j = random.randint(0, w - tw) if w - tw > 0 else 0 + + # return i, j, th, tw + for k in data: + if k in self.crop_keys: + if len(data[k].shape) == 3: + if np.argmin(data[k].shape) == 0: + img = data[k][:, i:i + th, j:j + tw] + if img.shape[1] != img.shape[2]: + a = 1 + elif np.argmin(data[k].shape) == 2: + img = data[k][i:i + th, j:j + tw, :] + if img.shape[1] != img.shape[0]: + a = 1 + else: + img = data[k] + else: + img = data[k][i:i + th, j:j + tw] + if img.shape[0] != img.shape[1]: + a = 1 + data[k] = img + return data diff --git a/ppocr/data/imaug/rec_img_aug.py b/ppocr/data/imaug/rec_img_aug.py new file 
mode 100644 index 0000000000000000000000000000000000000000..33cd9f1705568014b3150cdf1629608bb97de447 --- /dev/null +++ b/ppocr/data/imaug/rec_img_aug.py @@ -0,0 +1,825 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import cv2 +import numpy as np +import random +import copy +from PIL import Image +from .text_image_aug import tia_perspective, tia_stretch, tia_distort +from .abinet_aug import CVGeometry, CVDeterioration, CVColorJitter, SVTRGeometry, SVTRDeterioration +from paddle.vision.transforms import Compose + + +class RecAug(object): + def __init__(self, + tia_prob=0.4, + crop_prob=0.4, + reverse_prob=0.4, + noise_prob=0.4, + jitter_prob=0.4, + blur_prob=0.4, + hsv_aug_prob=0.4, + **kwargs): + self.tia_prob = tia_prob + self.bda = BaseDataAugmentation(crop_prob, reverse_prob, noise_prob, + jitter_prob, blur_prob, hsv_aug_prob) + + def __call__(self, data): + img = data['image'] + h, w, _ = img.shape + + # tia + if random.random() <= self.tia_prob: + if h >= 20 and w >= 20: + img = tia_distort(img, random.randint(3, 6)) + img = tia_stretch(img, random.randint(3, 6)) + img = tia_perspective(img) + + # bda + data['image'] = img + data = self.bda(data) + return data + + +class BaseDataAugmentation(object): + def __init__(self, + crop_prob=0.4, + reverse_prob=0.4, + noise_prob=0.4, + jitter_prob=0.4, + blur_prob=0.4, + hsv_aug_prob=0.4, + **kwargs): + self.crop_prob = crop_prob + self.reverse_prob = reverse_prob + self.noise_prob = noise_prob + self.jitter_prob = jitter_prob + self.blur_prob = blur_prob + self.hsv_aug_prob = hsv_aug_prob + + def __call__(self, data): + img = data['image'] + h, w, _ = img.shape + + if random.random() <= self.crop_prob and h >= 20 and w >= 20: + img = get_crop(img) + + if random.random() <= self.blur_prob: + img = blur(img) + + if random.random() <= self.hsv_aug_prob: + img = hsv_aug(img) + + if random.random() <= self.jitter_prob: + img = jitter(img) + + if random.random() <= self.noise_prob: + img = add_gasuss_noise(img) + + if random.random() <= self.reverse_prob: + img = 255 - img + + data['image'] = img + return data + + +class ABINetRecAug(object): + def __init__(self, + geometry_p=0.5, + deterioration_p=0.25, + colorjitter_p=0.25, + **kwargs): + self.transforms = Compose([ + CVGeometry( + degrees=45, + translate=(0.0, 0.0), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=geometry_p), CVDeterioration( + var=20, degrees=6, factor=4, p=deterioration_p), + CVColorJitter( + brightness=0.5, + contrast=0.5, + saturation=0.5, + hue=0.1, + p=colorjitter_p) + ]) + + def __call__(self, data): + img = data['image'] + img = self.transforms(img) + data['image'] = img + return data + + +class RecConAug(object): + def __init__(self, + prob=0.5, + image_shape=(32, 320, 3), + max_text_length=25, + ext_data_num=1, + **kwargs): + self.ext_data_num = ext_data_num + self.prob = prob + self.max_text_length = max_text_length + self.image_shape = image_shape + self.max_wh_ratio = 
self.image_shape[1] / self.image_shape[0] + + def merge_ext_data(self, data, ext_data): + ori_w = round(data['image'].shape[1] / data['image'].shape[0] * + self.image_shape[0]) + ext_w = round(ext_data['image'].shape[1] / ext_data['image'].shape[0] * + self.image_shape[0]) + data['image'] = cv2.resize(data['image'], (ori_w, self.image_shape[0])) + ext_data['image'] = cv2.resize(ext_data['image'], + (ext_w, self.image_shape[0])) + data['image'] = np.concatenate( + [data['image'], ext_data['image']], axis=1) + data["label"] += ext_data["label"] + return data + + def __call__(self, data): + rnd_num = random.random() + if rnd_num > self.prob: + return data + for idx, ext_data in enumerate(data["ext_data"]): + if len(data["label"]) + len(ext_data[ + "label"]) > self.max_text_length: + break + concat_ratio = data['image'].shape[1] / data['image'].shape[ + 0] + ext_data['image'].shape[1] / ext_data['image'].shape[0] + if concat_ratio > self.max_wh_ratio: + break + data = self.merge_ext_data(data, ext_data) + data.pop("ext_data") + return data + + +class SVTRRecAug(object): + def __init__(self, + aug_type=0, + geometry_p=0.5, + deterioration_p=0.25, + colorjitter_p=0.25, + **kwargs): + self.transforms = Compose([ + SVTRGeometry( + aug_type=aug_type, + degrees=45, + translate=(0.0, 0.0), + scale=(0.5, 2.), + shear=(45, 15), + distortion=0.5, + p=geometry_p), SVTRDeterioration( + var=20, degrees=6, factor=4, p=deterioration_p), + CVColorJitter( + brightness=0.5, + contrast=0.5, + saturation=0.5, + hue=0.1, + p=colorjitter_p) + ]) + + def __call__(self, data): + img = data['image'] + img = self.transforms(img) + data['image'] = img + return data + + +class ClsResizeImg(object): + def __init__(self, image_shape, **kwargs): + self.image_shape = image_shape + + def __call__(self, data): + img = data['image'] + norm_img, _ = resize_norm_img(img, self.image_shape) + data['image'] = norm_img + return data + + +class RecResizeImg(object): + def __init__(self, + image_shape, + infer_mode=False, + character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', + padding=True, + **kwargs): + self.image_shape = image_shape + self.infer_mode = infer_mode + self.character_dict_path = character_dict_path + self.padding = padding + + def __call__(self, data): + img = data['image'] + if self.infer_mode and self.character_dict_path is not None: + norm_img, valid_ratio = resize_norm_img_chinese(img, + self.image_shape) + else: + norm_img, valid_ratio = resize_norm_img(img, self.image_shape, + self.padding) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class VLRecResizeImg(object): + def __init__(self, + image_shape, + infer_mode=False, + character_dict_path='./ppocr/utils/ppocr_keys_v1.txt', + padding=True, + **kwargs): + self.image_shape = image_shape + self.infer_mode = infer_mode + self.character_dict_path = character_dict_path + self.padding = padding + + def __call__(self, data): + img = data['image'] + + imgC, imgH, imgW = self.image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_w = imgW + resized_image = resized_image.astype('float32') + if self.image_shape[0] == 1: + resized_image = resized_image / 255 + norm_img = resized_image[np.newaxis, :] + else: + norm_img = resized_image.transpose((2, 0, 1)) / 255 + valid_ratio = min(1.0, float(resized_w / imgW)) + + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class RFLRecResizeImg(object): + def __init__(self, image_shape, padding=True, interpolation=1, 
**kwargs): + self.image_shape = image_shape + self.padding = padding + + self.interpolation = interpolation + if self.interpolation == 0: + self.interpolation = cv2.INTER_NEAREST + elif self.interpolation == 1: + self.interpolation = cv2.INTER_LINEAR + elif self.interpolation == 2: + self.interpolation = cv2.INTER_CUBIC + elif self.interpolation == 3: + self.interpolation = cv2.INTER_AREA + else: + raise Exception("Unsupported interpolation type !!!") + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + norm_img, valid_ratio = resize_norm_img( + img, self.image_shape, self.padding, self.interpolation) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class SRNRecResizeImg(object): + def __init__(self, image_shape, num_heads, max_text_length, **kwargs): + self.image_shape = image_shape + self.num_heads = num_heads + self.max_text_length = max_text_length + + def __call__(self, data): + img = data['image'] + norm_img = resize_norm_img_srn(img, self.image_shape) + data['image'] = norm_img + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + srn_other_inputs(self.image_shape, self.num_heads, self.max_text_length) + + data['encoder_word_pos'] = encoder_word_pos + data['gsrm_word_pos'] = gsrm_word_pos + data['gsrm_slf_attn_bias1'] = gsrm_slf_attn_bias1 + data['gsrm_slf_attn_bias2'] = gsrm_slf_attn_bias2 + return data + + +class SARRecResizeImg(object): + def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, self.image_shape, self.width_downsample_ratio) + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + return data + + +class PRENResizeImg(object): + def __init__(self, image_shape, **kwargs): + """ + Accroding to original paper's realization, it's a hard resize method here. + So maybe you should optimize it to fit for your task better. 
+ """ + self.dst_h, self.dst_w = image_shape + + def __call__(self, data): + img = data['image'] + resized_img = cv2.resize( + img, (self.dst_w, self.dst_h), interpolation=cv2.INTER_LINEAR) + resized_img = resized_img.transpose((2, 0, 1)) / 255 + resized_img -= 0.5 + resized_img /= 0.5 + data['image'] = resized_img.astype(np.float32) + return data + + +class SPINRecResizeImg(object): + def __init__(self, + image_shape, + interpolation=2, + mean=(127.5, 127.5, 127.5), + std=(127.5, 127.5, 127.5), + **kwargs): + self.image_shape = image_shape + + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.interpolation = interpolation + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # different interpolation type corresponding the OpenCV + if self.interpolation == 0: + interpolation = cv2.INTER_NEAREST + elif self.interpolation == 1: + interpolation = cv2.INTER_LINEAR + elif self.interpolation == 2: + interpolation = cv2.INTER_CUBIC + elif self.interpolation == 3: + interpolation = cv2.INTER_AREA + else: + raise Exception("Unsupported interpolation type !!!") + # Deal with the image error during image loading + if img is None: + return None + + img = cv2.resize(img, tuple(self.image_shape), interpolation) + img = np.array(img, np.float32) + img = np.expand_dims(img, -1) + img = img.transpose((2, 0, 1)) + # normalize the image + img = img.copy().astype(np.float32) + mean = np.float64(self.mean.reshape(1, -1)) + stdinv = 1 / np.float64(self.std.reshape(1, -1)) + img -= mean + img *= stdinv + data['image'] = img + return data + + +class GrayRecResizeImg(object): + def __init__(self, + image_shape, + resize_type, + inter_type='Image.ANTIALIAS', + scale=True, + padding=False, + **kwargs): + self.image_shape = image_shape + self.resize_type = resize_type + self.padding = padding + self.inter_type = eval(inter_type) + self.scale = scale + + def __call__(self, data): + img = data['image'] + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + image_shape = self.image_shape + if self.padding: + imgC, imgH, imgW = image_shape + # todo: change to 0 and modified image shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + norm_img = np.expand_dims(resized_image, -1) + norm_img = norm_img.transpose((2, 0, 1)) + resized_image = norm_img.astype(np.float32) / 128. - 1. + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + data['image'] = padding_im + return data + if self.resize_type == 'PIL': + image_pil = Image.fromarray(np.uint8(img)) + img = image_pil.resize(self.image_shape, self.inter_type) + img = np.array(img) + if self.resize_type == 'OpenCV': + img = cv2.resize(img, self.image_shape) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + if self.scale: + data['image'] = norm_img.astype(np.float32) / 128. - 1. + else: + data['image'] = norm_img.astype(np.float32) / 255. 
+ return data + + +class ABINetRecResizeImg(object): + def __init__(self, image_shape, **kwargs): + self.image_shape = image_shape + + def __call__(self, data): + img = data['image'] + norm_img, valid_ratio = resize_norm_img_abinet(img, self.image_shape) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class SVTRRecResizeImg(object): + def __init__(self, image_shape, padding=True, **kwargs): + self.image_shape = image_shape + self.padding = padding + + def __call__(self, data): + img = data['image'] + + norm_img, valid_ratio = resize_norm_img(img, self.image_shape, + self.padding) + data['image'] = norm_img + data['valid_ratio'] = valid_ratio + return data + + +class RobustScannerRecResizeImg(object): + def __init__(self, + image_shape, + max_text_length, + width_downsample_ratio=0.25, + **kwargs): + self.image_shape = image_shape + self.width_downsample_ratio = width_downsample_ratio + self.max_text_length = max_text_length + + def __call__(self, data): + img = data['image'] + norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar( + img, self.image_shape, self.width_downsample_ratio) + word_positons = np.array(range(0, self.max_text_length)).astype('int64') + data['image'] = norm_img + data['resized_shape'] = resize_shape + data['pad_shape'] = pad_shape + data['valid_ratio'] = valid_ratio + data['word_positons'] = word_positons + return data + + +def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. + width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + +def resize_norm_img(img, + image_shape, + padding=True, + interpolation=cv2.INTER_LINEAR): + imgC, imgH, imgW = image_shape + h = img.shape[0] + w = img.shape[1] + if not padding: + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=interpolation) + resized_w = imgW + else: + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + valid_ratio = min(1.0, float(resized_w / imgW)) + 
return padding_im, valid_ratio + + +def resize_norm_img_chinese(img, image_shape): + imgC, imgH, imgW = image_shape + # todo: change to 0 and modified image shape + max_wh_ratio = imgW * 1.0 / imgH + h, w = img.shape[0], img.shape[1] + ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, ratio) + imgW = int(imgH * max_wh_ratio) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + valid_ratio = min(1.0, float(resized_w / imgW)) + return padding_im, valid_ratio + + +def resize_norm_img_srn(img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + +def resize_norm_img_abinet(img, image_shape): + imgC, imgH, imgW = image_shape + + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_w = imgW + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255. + + mean = np.array([0.485, 0.456, 0.406]) + std = np.array([0.229, 0.224, 0.225]) + resized_image = ( + resized_image - mean[None, None, ...]) / std[None, None, ...] 
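
For the padded branch of resize_norm_img above, the arithmetic is worth spelling out: the crop is scaled to the target height while keeping its aspect ratio, right-padded with zeros up to the target width, and valid_ratio records how much of that width carries real pixels. A hedged numeric sketch (the 32x100 crop and the 3x48x320 target shape are assumed values):

```python
import math
import cv2
import numpy as np

h, w = 32, 100                      # source crop
imgC, imgH, imgW = 3, 48, 320       # recognizer input shape

resized_w = min(imgW, int(math.ceil(imgH * (w / float(h)))))   # ceil(48 * 3.125) = 150
valid_ratio = min(1.0, resized_w / imgW)                        # 150 / 320 = 0.46875

img = np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)
resized = cv2.resize(img, (resized_w, imgH)).astype('float32')
padded = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padded[:, :, :resized_w] = (resized.transpose((2, 0, 1)) / 255 - 0.5) / 0.5
print(padded.shape, valid_ratio)    # (3, 48, 320) 0.46875
```
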
+ resized_image = resized_image.transpose((2, 0, 1)) + resized_image = resized_image.astype('float32') + + valid_ratio = min(1.0, float(resized_w / imgW)) + return resized_image, valid_ratio + + +def srn_other_inputs(image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, + [num_heads, 1, 1]) * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, + [num_heads, 1, 1]) * [-1e9] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + +def flag(): + """ + flag + """ + return 1 if random.random() > 0.5000001 else -1 + + +def hsv_aug(img): + """ + cvtColor + """ + hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + delta = 0.001 * random.random() * flag() + hsv[:, :, 2] = hsv[:, :, 2] * (1 + delta) + new_img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + return new_img + + +def blur(img): + """ + blur + """ + h, w, _ = img.shape + if h > 10 and w > 10: + return cv2.GaussianBlur(img, (5, 5), 1) + else: + return img + + +def jitter(img): + """ + jitter + """ + w, h, _ = img.shape + if h > 10 and w > 10: + thres = min(w, h) + s = int(random.random() * thres * 0.01) + src_img = img.copy() + for i in range(s): + img[i:, i:, :] = src_img[:w - i, :h - i, :] + return img + else: + return img + + +def add_gasuss_noise(image, mean=0, var=0.1): + """ + Gasuss noise + """ + + noise = np.random.normal(mean, var**0.5, image.shape) + out = image + 0.5 * noise + out = np.clip(out, 0, 255) + out = np.uint8(out) + return out + + +def get_crop(image): + """ + random crop + """ + h, w, _ = image.shape + top_min = 1 + top_max = 8 + top_crop = int(random.randint(top_min, top_max)) + top_crop = min(top_crop, h - 1) + crop_img = image.copy() + ratio = random.randint(0, 1) + if ratio: + crop_img = crop_img[top_crop:h, :, :] + else: + crop_img = crop_img[0:h - top_crop, :, :] + return crop_img + + +def rad(x): + """ + rad + """ + return x * np.pi / 180 + + +def get_warpR(config): + """ + get_warpR + """ + anglex, angley, anglez, fov, w, h, r = \ + config.anglex, config.angley, config.anglez, config.fov, config.w, config.h, config.r + if w > 69 and w < 112: + anglex = anglex * 1.5 + + z = np.sqrt(w**2 + h**2) / 2 / np.tan(rad(fov / 2)) + # Homogeneous coordinate transformation matrix + rx = np.array([[1, 0, 0, 0], + [0, np.cos(rad(anglex)), -np.sin(rad(anglex)), 0], [ + 0, + -np.sin(rad(anglex)), + np.cos(rad(anglex)), + 0, + ], [0, 0, 0, 1]], np.float32) + ry = np.array([[np.cos(rad(angley)), 0, np.sin(rad(angley)), 0], + [0, 1, 0, 0], [ + -np.sin(rad(angley)), + 0, + np.cos(rad(angley)), + 0, + ], [0, 0, 0, 1]], np.float32) + rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0, 0], + [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0, 0], + [0, 0, 1, 0], [0, 0, 0, 1]], np.float32) + r = rx.dot(ry).dot(rz) + # generate 4 points + pcenter = np.array([h / 2, w / 2, 0, 0], np.float32) + p1 = np.array([0, 0, 0, 0], np.float32) - pcenter + p2 = np.array([w, 0, 0, 0], np.float32) - pcenter 
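
The corner points assembled here feed the projection a few lines below: each rotated corner's centred x and y are scaled by z / (z - z_rot) before the centre offset is added back, which is what turns the 3-D rotation into a 2-D perspective warp. A toy numeric check of that scaling step (the image size, fov, and the 10 px depth are assumed values):

```python
import numpy as np

w, h, fov = 100.0, 32.0, 42.0
z = np.sqrt(w ** 2 + h ** 2) / 2 / np.tan(np.deg2rad(fov / 2))   # focal distance, ~136.8

x_c, y_c, z_rot = w / 2, h / 2, 10.0    # a centred corner that gained 10 px of depth
scale = z / (z - z_rot)                  # ~1.079
print(np.round([x_c * scale, y_c * scale], 1))   # [53.9 17.3]
```
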
+ p3 = np.array([0, h, 0, 0], np.float32) - pcenter + p4 = np.array([w, h, 0, 0], np.float32) - pcenter + dst1 = r.dot(p1) + dst2 = r.dot(p2) + dst3 = r.dot(p3) + dst4 = r.dot(p4) + list_dst = np.array([dst1, dst2, dst3, dst4]) + org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32) + dst = np.zeros((4, 2), np.float32) + # Project onto the image plane + dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0] + dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1] + + warpR = cv2.getPerspectiveTransform(org, dst) + + dst1, dst2, dst3, dst4 = dst + r1 = int(min(dst1[1], dst2[1])) + r2 = int(max(dst3[1], dst4[1])) + c1 = int(min(dst1[0], dst3[0])) + c2 = int(max(dst2[0], dst4[0])) + + try: + ratio = min(1.0 * h / (r2 - r1), 1.0 * w / (c2 - c1)) + + dx = -c1 + dy = -r1 + T1 = np.float32([[1., 0, dx], [0, 1., dy], [0, 0, 1.0 / ratio]]) + ret = T1.dot(warpR) + except: + ratio = 1.0 + T1 = np.float32([[1., 0, 0], [0, 1., 0], [0, 0, 1.]]) + ret = T1 + return ret, (-r1, -c1), ratio, dst + + +def get_warpAffine(config): + """ + get_warpAffine + """ + anglez = config.anglez + rz = np.array([[np.cos(rad(anglez)), np.sin(rad(anglez)), 0], + [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0]], np.float32) + return rz diff --git a/ppocr/data/imaug/sast_process.py b/ppocr/data/imaug/sast_process.py new file mode 100644 index 0000000000000000000000000000000000000000..08d03b194dcfab92ab59329857d4a1326531218e --- /dev/null +++ b/ppocr/data/imaug/sast_process.py @@ -0,0 +1,777 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +""" +This part code is refered from: +https://github.com/songdejia/EAST/blob/master/data_utils.py +""" +import math +import cv2 +import numpy as np +import json +import sys +import os + +__all__ = ['SASTProcessTrain'] + + +class SASTProcessTrain(object): + def __init__(self, + image_shape=[512, 512], + min_crop_size=24, + min_crop_side_ratio=0.3, + min_text_size=10, + max_text_size=512, + **kwargs): + self.input_size = image_shape[1] + self.min_crop_size = min_crop_size + self.min_crop_side_ratio = min_crop_side_ratio + self.min_text_size = min_text_size + self.max_text_size = max_text_size + + def quad_area(self, poly): + """ + compute area of a polygon + :param poly: + :return: + """ + edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]), + (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]), + (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]), + (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])] + return np.sum(edge) / 2. + + def gen_quad_from_poly(self, poly): + """ + Generate min area quad from poly. 
+ """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if True: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad + + def check_and_validate_polys(self, polys, tags, xxx_todo_changeme): + """ + check so that the text poly is in the same direction, + and also filter some invalid polygons + :param polys: + :param tags: + :return: + """ + (h, w) = xxx_todo_changeme + if polys.shape[0] == 0: + return polys, np.array([]), np.array([]) + polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1) + polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1) + + validated_polys = [] + validated_tags = [] + hv_tags = [] + for poly, tag in zip(polys, tags): + quad = self.gen_quad_from_poly(poly) + p_area = self.quad_area(quad) + if abs(p_area) < 1: + print('invalid poly') + continue + if p_area > 0: + if tag == False: + print('poly in wrong direction') + tag = True # reversed cases should be ignore + poly = poly[(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1), :] + quad = quad[(0, 3, 2, 1), :] + + len_w = np.linalg.norm(quad[0] - quad[1]) + np.linalg.norm(quad[3] - + quad[2]) + len_h = np.linalg.norm(quad[0] - quad[3]) + np.linalg.norm(quad[1] - + quad[2]) + hv_tag = 1 + + if len_w * 2.0 < len_h: + hv_tag = 0 + + validated_polys.append(poly) + validated_tags.append(tag) + hv_tags.append(hv_tag) + return np.array(validated_polys), np.array(validated_tags), np.array( + hv_tags) + + def crop_area(self, + im, + polys, + tags, + hv_tags, + crop_background=False, + max_tries=25): + """ + make random crop from the input image + :param im: + :param polys: + :param tags: + :param crop_background: + :param max_tries: 50 -> 25 + :return: + """ + h, w, _ = im.shape + pad_h = h // 10 + pad_w = w // 10 + h_array = np.zeros((h + pad_h * 2), dtype=np.int32) + w_array = np.zeros((w + pad_w * 2), dtype=np.int32) + for poly in polys: + poly = np.round(poly, decimals=0).astype(np.int32) + minx = np.min(poly[:, 0]) + maxx = np.max(poly[:, 0]) + w_array[minx + pad_w:maxx + pad_w] = 1 + miny = np.min(poly[:, 1]) + maxy = np.max(poly[:, 1]) + h_array[miny + pad_h:maxy + pad_h] = 1 + # ensure the cropped area not across a text + h_axis = np.where(h_array == 0)[0] + w_axis = np.where(w_array == 0)[0] + if len(h_axis) == 0 or len(w_axis) == 0: + return im, polys, tags, hv_tags + for i in range(max_tries): + xx = np.random.choice(w_axis, size=2) + xmin = np.min(xx) - pad_w + xmax = np.max(xx) - pad_w + xmin = np.clip(xmin, 0, w - 1) + xmax = np.clip(xmax, 0, w - 1) + yy = np.random.choice(h_axis, size=2) + ymin = np.min(yy) - pad_h + ymax = np.max(yy) - pad_h + ymin = np.clip(ymin, 0, h - 1) + ymax = np.clip(ymax, 0, h - 1) + # if xmax - xmin < ARGS.min_crop_side_ratio * w or \ + # ymax - ymin < ARGS.min_crop_side_ratio * h: + if xmax - xmin < self.min_crop_size or \ + ymax - ymin < self.min_crop_size: + # area too small + continue + if polys.shape[0] != 0: + poly_axis_in_area = (polys[:, :, 0] >= xmin) & (polys[:, :, 0] <= xmax) \ + & 
(polys[:, :, 1] >= ymin) & (polys[:, :, 1] <= ymax) + selected_polys = np.where( + np.sum(poly_axis_in_area, axis=1) == 4)[0] + else: + selected_polys = [] + if len(selected_polys) == 0: + # no text in this area + if crop_background: + return im[ymin : ymax + 1, xmin : xmax + 1, :], \ + polys[selected_polys], tags[selected_polys], hv_tags[selected_polys] + else: + continue + im = im[ymin:ymax + 1, xmin:xmax + 1, :] + polys = polys[selected_polys] + tags = tags[selected_polys] + hv_tags = hv_tags[selected_polys] + polys[:, :, 0] -= xmin + polys[:, :, 1] -= ymin + return im, polys, tags, hv_tags + + return im, polys, tags, hv_tags + + def generate_direction_map(self, poly_quads, direction_map): + """ + """ + width_list = [] + height_list = [] + for quad in poly_quads: + quad_w = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + width_list.append(quad_w) + height_list.append(quad_h) + norm_width = max(sum(width_list) / (len(width_list) + 1e-6), 1.0) + average_height = max(sum(height_list) / (len(height_list) + 1e-6), 1.0) + + for quad in poly_quads: + direct_vector_full = ( + (quad[1] + quad[2]) - (quad[0] + quad[3])) / 2.0 + direct_vector = direct_vector_full / ( + np.linalg.norm(direct_vector_full) + 1e-6) * norm_width + direction_label = tuple( + map(float, [ + direct_vector[0], direct_vector[1], 1.0 / (average_height + + 1e-6) + ])) + cv2.fillPoly(direction_map, + quad.round().astype(np.int32)[np.newaxis, :, :], + direction_label) + return direction_map + + def calculate_average_height(self, poly_quads): + """ + """ + height_list = [] + for quad in poly_quads: + quad_h = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[2] - quad[1])) / 2.0 + height_list.append(quad_h) + average_height = max(sum(height_list) / len(height_list), 1.0) + return average_height + + def generate_tcl_label(self, + hw, + polys, + tags, + ds_ratio, + tcl_ratio=0.3, + shrink_ratio_of_width=0.15): + """ + Generate polygon. 
+ """ + h, w = hw + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + + score_map = np.zeros( + ( + h, + w, ), dtype=np.float32) + tbo_map = np.zeros((h, w, 5), dtype=np.float32) + training_mask = np.ones( + ( + h, + w, ), dtype=np.float32) + direction_map = np.ones((h, w, 3)) * np.array([0, 0, 1]).reshape( + [1, 1, 3]).astype(np.float32) + + for poly_idx, poly_tag in enumerate(zip(polys, tags)): + poly = poly_tag[0] + tag = poly_tag[1] + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + if min(min_area_quad_h, min_area_quad_w) < self.min_text_size * ds_ratio \ + or min(min_area_quad_h, min_area_quad_w) > self.max_text_size * ds_ratio: + continue + + if tag: + # continue + cv2.fillPoly(training_mask, + poly.astype(np.int32)[np.newaxis, :, :], 0.15) + else: + tcl_poly = self.poly2tcl(poly, tcl_ratio) + tcl_quads = self.poly2quads(tcl_poly) + poly_quads = self.poly2quads(poly) + # stcl map + stcl_quads, quad_index = self.shrink_poly_along_width( + tcl_quads, + shrink_ratio_of_width=shrink_ratio_of_width, + expand_height_ratio=1.0 / tcl_ratio) + # generate tcl map + cv2.fillPoly(score_map, + np.round(stcl_quads).astype(np.int32), 1.0) + + # generate tbo map + for idx, quad in enumerate(stcl_quads): + quad_mask = np.zeros((h, w), dtype=np.float32) + quad_mask = cv2.fillPoly( + quad_mask, + np.round(quad[np.newaxis, :, :]).astype(np.int32), 1.0) + tbo_map = self.gen_quad_tbo(poly_quads[quad_index[idx]], + quad_mask, tbo_map) + return score_map, tbo_map, training_mask + + def generate_tvo_and_tco(self, + hw, + polys, + tags, + tcl_ratio=0.3, + ds_ratio=0.25): + """ + Generate tcl map, tvo map and tbo map. 
+ """ + h, w = hw + h, w = int(h * ds_ratio), int(w * ds_ratio) + polys = polys * ds_ratio + poly_mask = np.zeros((h, w), dtype=np.float32) + + tvo_map = np.ones((9, h, w), dtype=np.float32) + tvo_map[0:-1:2] = np.tile(np.arange(0, w), (h, 1)) + tvo_map[1:-1:2] = np.tile(np.arange(0, w), (h, 1)).T + poly_tv_xy_map = np.zeros((8, h, w), dtype=np.float32) + + # tco map + tco_map = np.ones((3, h, w), dtype=np.float32) + tco_map[0] = np.tile(np.arange(0, w), (h, 1)) + tco_map[1] = np.tile(np.arange(0, w), (h, 1)).T + poly_tc_xy_map = np.zeros((2, h, w), dtype=np.float32) + + poly_short_edge_map = np.ones((h, w), dtype=np.float32) + + for poly, poly_tag in zip(polys, tags): + + if poly_tag == True: + continue + + # adjust point order for vertical poly + poly = self.adjust_point(poly) + + # generate min_area_quad + min_area_quad, center_point = self.gen_min_area_quad_from_poly(poly) + min_area_quad_h = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[3]) + + np.linalg.norm(min_area_quad[1] - min_area_quad[2])) + min_area_quad_w = 0.5 * ( + np.linalg.norm(min_area_quad[0] - min_area_quad[1]) + + np.linalg.norm(min_area_quad[2] - min_area_quad[3])) + + # generate tcl map and text, 128 * 128 + tcl_poly = self.poly2tcl(poly, tcl_ratio) + + # generate poly_tv_xy_map + for idx in range(4): + cv2.fillPoly( + poly_tv_xy_map[2 * idx], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(min(max(min_area_quad[idx, 0], 0), w))) + cv2.fillPoly( + poly_tv_xy_map[2 * idx + 1], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(min(max(min_area_quad[idx, 1], 0), h))) + + # generate poly_tc_xy_map + for idx in range(2): + cv2.fillPoly( + poly_tc_xy_map[idx], + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(center_point[idx])) + + # generate poly_short_edge_map + cv2.fillPoly( + poly_short_edge_map, + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + float(max(min(min_area_quad_h, min_area_quad_w), 1.0))) + + # generate poly_mask and training_mask + cv2.fillPoly(poly_mask, + np.round(tcl_poly[np.newaxis, :, :]).astype(np.int32), + 1) + + tvo_map *= poly_mask + tvo_map[:8] -= poly_tv_xy_map + tvo_map[-1] /= poly_short_edge_map + tvo_map = tvo_map.transpose((1, 2, 0)) + + tco_map *= poly_mask + tco_map[:2] -= poly_tc_xy_map + tco_map[-1] /= poly_short_edge_map + tco_map = tco_map.transpose((1, 2, 0)) + + return tvo_map, tco_map + + def adjust_point(self, poly): + """ + adjust point order. + """ + point_num = poly.shape[0] + if point_num == 4: + len_1 = np.linalg.norm(poly[0] - poly[1]) + len_2 = np.linalg.norm(poly[1] - poly[2]) + len_3 = np.linalg.norm(poly[2] - poly[3]) + len_4 = np.linalg.norm(poly[3] - poly[0]) + + if (len_1 + len_3) * 1.5 < (len_2 + len_4): + poly = poly[[1, 2, 3, 0], :] + + elif point_num > 4: + vector_1 = poly[0] - poly[1] + vector_2 = poly[1] - poly[2] + cos_theta = np.dot(vector_1, vector_2) / ( + np.linalg.norm(vector_1) * np.linalg.norm(vector_2) + 1e-6) + theta = np.arccos(np.round(cos_theta, decimals=4)) + + if abs(theta) > (70 / 180 * math.pi): + index = list(range(1, point_num)) + [0] + poly = poly[np.array(index), :] + return poly + + def gen_min_area_quad_from_poly(self, poly): + """ + Generate min area quad from poly. 
+ """ + point_num = poly.shape[0] + min_area_quad = np.zeros((4, 2), dtype=np.float32) + if point_num == 4: + min_area_quad = poly + center_point = np.sum(poly, axis=0) / 4 + else: + rect = cv2.minAreaRect(poly.astype( + np.int32)) # (center (x,y), (width, height), angle of rotation) + center_point = rect[0] + box = np.array(cv2.boxPoints(rect)) + + first_point_idx = 0 + min_dist = 1e4 + for i in range(4): + dist = np.linalg.norm(box[(i + 0) % 4] - poly[0]) + \ + np.linalg.norm(box[(i + 1) % 4] - poly[point_num // 2 - 1]) + \ + np.linalg.norm(box[(i + 2) % 4] - poly[point_num // 2]) + \ + np.linalg.norm(box[(i + 3) % 4] - poly[-1]) + if dist < min_dist: + min_dist = dist + first_point_idx = i + + for i in range(4): + min_area_quad[i] = box[(first_point_idx + i) % 4] + + return min_area_quad, center_point + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def shrink_poly_along_width(self, + quads, + shrink_ratio_of_width, + expand_height_ratio=1.0): + """ + shrink poly with given length. + """ + upper_edge_list = [] + + def get_cut_info(edge_len_list, cut_len): + for idx, edge_len in enumerate(edge_len_list): + cut_len -= edge_len + if cut_len <= 0.000001: + ratio = (cut_len + edge_len_list[idx]) / edge_len_list[idx] + return idx, ratio + + for quad in quads: + upper_edge_len = np.linalg.norm(quad[0] - quad[1]) + upper_edge_list.append(upper_edge_len) + + # length of left edge and right edge. + left_length = np.linalg.norm(quads[0][0] - quads[0][ + 3]) * expand_height_ratio + right_length = np.linalg.norm(quads[-1][1] - quads[-1][ + 2]) * expand_height_ratio + + shrink_length = min(left_length, right_length, + sum(upper_edge_list)) * shrink_ratio_of_width + # shrinking length + upper_len_left = shrink_length + upper_len_right = sum(upper_edge_list) - shrink_length + + left_idx, left_ratio = get_cut_info(upper_edge_list, upper_len_left) + left_quad = self.shrink_quad_along_width( + quads[left_idx], begin_width_ratio=left_ratio, end_width_ratio=1) + right_idx, right_ratio = get_cut_info(upper_edge_list, upper_len_right) + right_quad = self.shrink_quad_along_width( + quads[right_idx], begin_width_ratio=0, end_width_ratio=right_ratio) + + out_quad_list = [] + if left_idx == right_idx: + out_quad_list.append( + [left_quad[0], right_quad[1], right_quad[2], left_quad[3]]) + else: + out_quad_list.append(left_quad) + for idx in range(left_idx + 1, right_idx): + out_quad_list.append(quads[idx]) + out_quad_list.append(right_quad) + + return np.array(out_quad_list), list(range(left_idx, right_idx + 1)) + + def vector_angle(self, A, B): + """ + Calculate the angle between vector AB and x-axis positive direction. + """ + AB = np.array([B[1] - A[1], B[0] - A[0]]) + return np.arctan2(*AB) + + def theta_line_cross_point(self, theta, point): + """ + Calculate the line through given point and angle in ax + by + c =0 form. + """ + x, y = point + cos = np.cos(theta) + sin = np.sin(theta) + return [sin, -cos, cos * y - sin * x] + + def line_cross_two_point(self, A, B): + """ + Calculate the line through given point A and B in ax + by + c =0 form. 
+ """ + angle = self.vector_angle(A, B) + return self.theta_line_cross_point(angle, A) + + def average_angle(self, poly): + """ + Calculate the average angle between left and right edge in given poly. + """ + p0, p1, p2, p3 = poly + angle30 = self.vector_angle(p3, p0) + angle21 = self.vector_angle(p2, p1) + return (angle30 + angle21) / 2 + + def line_cross_point(self, line1, line2): + """ + line1 and line2 in 0=ax+by+c form, compute the cross point of line1 and line2 + """ + a1, b1, c1 = line1 + a2, b2, c2 = line2 + d = a1 * b2 - a2 * b1 + + if d == 0: + #print("line1", line1) + #print("line2", line2) + print('Cross point does not exist') + return np.array([0, 0], dtype=np.float32) + else: + x = (b1 * c2 - b2 * c1) / d + y = (a2 * c1 - a1 * c2) / d + + return np.array([x, y], dtype=np.float32) + + def quad2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. (4, 2) + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + p0_3 = poly[0] + (poly[3] - poly[0]) * ratio_pair + p1_2 = poly[1] + (poly[2] - poly[1]) * ratio_pair + return np.array([p0_3[0], p1_2[0], p1_2[1], p0_3[1]]) + + def poly2tcl(self, poly, ratio): + """ + Generate center line by poly clock-wise point. + """ + ratio_pair = np.array( + [[0.5 - ratio / 2], [0.5 + ratio / 2]], dtype=np.float32) + tcl_poly = np.zeros_like(poly) + point_num = poly.shape[0] + + for idx in range(point_num // 2): + point_pair = poly[idx] + (poly[point_num - 1 - idx] - poly[idx] + ) * ratio_pair + tcl_poly[idx] = point_pair[0] + tcl_poly[point_num - 1 - idx] = point_pair[1] + return tcl_poly + + def gen_quad_tbo(self, quad, tcl_mask, tbo_map): + """ + Generate tbo_map for give quad. + """ + # upper and lower line function: ax + by + c = 0; + up_line = self.line_cross_two_point(quad[0], quad[1]) + lower_line = self.line_cross_two_point(quad[3], quad[2]) + + quad_h = 0.5 * (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) + quad_w = 0.5 * (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) + + # average angle of left and right line. + angle = self.average_angle(quad) + + xy_in_poly = np.argwhere(tcl_mask == 1) + for y, x in xy_in_poly: + point = (x, y) + line = self.theta_line_cross_point(angle, point) + cross_point_upper = self.line_cross_point(up_line, line) + cross_point_lower = self.line_cross_point(lower_line, line) + ##FIX, offset reverse + upper_offset_x, upper_offset_y = cross_point_upper - point + lower_offset_x, lower_offset_y = cross_point_lower - point + tbo_map[y, x, 0] = upper_offset_y + tbo_map[y, x, 1] = upper_offset_x + tbo_map[y, x, 2] = lower_offset_y + tbo_map[y, x, 3] = lower_offset_x + tbo_map[y, x, 4] = 1.0 / max(min(quad_h, quad_w), 1.0) * 2 + return tbo_map + + def poly2quads(self, poly): + """ + Split poly into quads. 
+ """ + quad_list = [] + point_num = poly.shape[0] + + # point pair + point_pair_list = [] + for idx in range(point_num // 2): + point_pair = [poly[idx], poly[point_num - 1 - idx]] + point_pair_list.append(point_pair) + + quad_num = point_num // 2 - 1 + for idx in range(quad_num): + # reshape and adjust to clock-wise + quad_list.append((np.array(point_pair_list)[[idx, idx + 1]] + ).reshape(4, 2)[[0, 2, 3, 1]]) + + return np.array(quad_list) + + def __call__(self, data): + im = data['image'] + text_polys = data['polys'] + text_tags = data['ignore_tags'] + if im is None: + return None + if text_polys.shape[0] == 0: + return None + + h, w, _ = im.shape + text_polys, text_tags, hv_tags = self.check_and_validate_polys( + text_polys, text_tags, (h, w)) + + if text_polys.shape[0] == 0: + return None + + #set aspect ratio and keep area fix + asp_scales = np.arange(1.0, 1.55, 0.1) + asp_scale = np.random.choice(asp_scales) + + if np.random.rand() < 0.5: + asp_scale = 1.0 / asp_scale + asp_scale = math.sqrt(asp_scale) + + asp_wx = asp_scale + asp_hy = 1.0 / asp_scale + im = cv2.resize(im, dsize=None, fx=asp_wx, fy=asp_hy) + text_polys[:, :, 0] *= asp_wx + text_polys[:, :, 1] *= asp_hy + + h, w, _ = im.shape + if max(h, w) > 2048: + rd_scale = 2048.0 / max(h, w) + im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale) + text_polys *= rd_scale + h, w, _ = im.shape + if min(h, w) < 16: + return None + + #no background + im, text_polys, text_tags, hv_tags = self.crop_area(im, \ + text_polys, text_tags, hv_tags, crop_background=False) + + if text_polys.shape[0] == 0: + return None + #continue for all ignore case + if np.sum((text_tags * 1.0)) >= text_tags.size: + return None + new_h, new_w, _ = im.shape + if (new_h is None) or (new_w is None): + return None + #resize image + std_ratio = float(self.input_size) / max(new_w, new_h) + rand_scales = np.array( + [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0]) + rz_scale = std_ratio * np.random.choice(rand_scales) + im = cv2.resize(im, dsize=None, fx=rz_scale, fy=rz_scale) + text_polys[:, :, 0] *= rz_scale + text_polys[:, :, 1] *= rz_scale + + #add gaussian blur + if np.random.rand() < 0.1 * 0.5: + ks = np.random.permutation(5)[0] + 1 + ks = int(ks / 2) * 2 + 1 + im = cv2.GaussianBlur(im, ksize=(ks, ks), sigmaX=0, sigmaY=0) + #add brighter + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 + np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + #add darker + if np.random.rand() < 0.1 * 0.5: + im = im * (1.0 - np.random.rand() * 0.5) + im = np.clip(im, 0.0, 255.0) + + # Padding the im to [input_size, input_size] + new_h, new_w, _ = im.shape + if min(new_w, new_h) < self.input_size * 0.5: + return None + + im_padded = np.ones( + (self.input_size, self.input_size, 3), dtype=np.float32) + im_padded[:, :, 2] = 0.485 * 255 + im_padded[:, :, 1] = 0.456 * 255 + im_padded[:, :, 0] = 0.406 * 255 + + # Random the start position + del_h = self.input_size - new_h + del_w = self.input_size - new_w + sh, sw = 0, 0 + if del_h > 1: + sh = int(np.random.rand() * del_h) + if del_w > 1: + sw = int(np.random.rand() * del_w) + + # Padding + im_padded[sh:sh + new_h, sw:sw + new_w, :] = im.copy() + text_polys[:, :, 0] += sw + text_polys[:, :, 1] += sh + + score_map, border_map, training_mask = self.generate_tcl_label( + (self.input_size, self.input_size), text_polys, text_tags, 0.25) + + # SAST head + tvo_map, tco_map = self.generate_tvo_and_tco( + (self.input_size, self.input_size), + text_polys, + text_tags, + tcl_ratio=0.3, + ds_ratio=0.25) + # 
print("test--------tvo_map shape:", tvo_map.shape) + + im_padded[:, :, 2] -= 0.485 * 255 + im_padded[:, :, 1] -= 0.456 * 255 + im_padded[:, :, 0] -= 0.406 * 255 + im_padded[:, :, 2] /= (255.0 * 0.229) + im_padded[:, :, 1] /= (255.0 * 0.224) + im_padded[:, :, 0] /= (255.0 * 0.225) + im_padded = im_padded.transpose((2, 0, 1)) + + data['image'] = im_padded[::-1, :, :] + data['score_map'] = score_map[np.newaxis, :, :] + data['border_map'] = border_map.transpose((2, 0, 1)) + data['training_mask'] = training_mask[np.newaxis, :, :] + data['tvo_map'] = tvo_map.transpose((2, 0, 1)) + data['tco_map'] = tco_map.transpose((2, 0, 1)) + return data diff --git a/ppocr/data/imaug/ssl_img_aug.py b/ppocr/data/imaug/ssl_img_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ed6ac3e230ae85754bf40189c392c7e6e29b63 --- /dev/null +++ b/ppocr/data/imaug/ssl_img_aug.py @@ -0,0 +1,60 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import cv2 +import numpy as np +import random +from PIL import Image + +from .rec_img_aug import resize_norm_img + + +class SSLRotateResize(object): + def __init__(self, + image_shape, + padding=False, + select_all=True, + mode="train", + **kwargs): + self.image_shape = image_shape + self.padding = padding + self.select_all = select_all + self.mode = mode + + def __call__(self, data): + img = data["image"] + + data["image_r90"] = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE) + data["image_r180"] = cv2.rotate(data["image_r90"], + cv2.ROTATE_90_CLOCKWISE) + data["image_r270"] = cv2.rotate(data["image_r180"], + cv2.ROTATE_90_CLOCKWISE) + + images = [] + for key in ["image", "image_r90", "image_r180", "image_r270"]: + images.append( + resize_norm_img( + data.pop(key), + image_shape=self.image_shape, + padding=self.padding)[0]) + data["image"] = np.stack(images, axis=0) + data["label"] = np.array(list(range(4))) + if not self.select_all: + data["image"] = data["image"][0::2] # just choose 0 and 180 + data["label"] = data["label"][0:2] # label needs to be continuous + if self.mode == "test": + data["image"] = data["image"][0] + data["label"] = data["label"][0] + return data diff --git a/ppocr/data/imaug/table_ops.py b/ppocr/data/imaug/table_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..c2c2fb2be6c80fdeb637717af2bbe122e1be999c --- /dev/null +++ b/ppocr/data/imaug/table_ops.py @@ -0,0 +1,229 @@ +""" +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class GenTableMask(object): + """ gen table mask """ + + def __init__(self, shrink_h_max, shrink_w_max, mask_type=0, **kwargs): + self.shrink_h_max = 5 + self.shrink_w_max = 5 + self.mask_type = mask_type + + def projection(self, erosion, h, w, spilt_threshold=0): + # 水平投影 + projection_map = np.ones_like(erosion) + project_val_array = [0 for _ in range(0, h)] + + for j in range(0, h): + for i in range(0, w): + if erosion[j, i] == 255: + project_val_array[j] += 1 + # 根据数组,获取切割点 + start_idx = 0 # 记录进入字符区的索引 + end_idx = 0 # 记录进入空白区域的索引 + in_text = False # 是否遍历到了字符区内 + box_list = [] + for i in range(len(project_val_array)): + if in_text == False and project_val_array[ + i] > spilt_threshold: # 进入字符区了 + in_text = True + start_idx = i + elif project_val_array[ + i] <= spilt_threshold and in_text == True: # 进入空白区了 + end_idx = i + in_text = False + if end_idx - start_idx <= 2: + continue + box_list.append((start_idx, end_idx + 1)) + + if in_text: + box_list.append((start_idx, h - 1)) + # 绘制投影直方图 + for j in range(0, h): + for i in range(0, project_val_array[j]): + projection_map[j, i] = 0 + return box_list, projection_map + + def projection_cx(self, box_img): + box_gray_img = cv2.cvtColor(box_img, cv2.COLOR_BGR2GRAY) + h, w = box_gray_img.shape + # 灰度图片进行二值化处理 + ret, thresh1 = cv2.threshold(box_gray_img, 200, 255, + cv2.THRESH_BINARY_INV) + # 纵向腐蚀 + if h < w: + kernel = np.ones((2, 1), np.uint8) + erode = cv2.erode(thresh1, kernel, iterations=1) + else: + erode = thresh1 + # 水平膨胀 + kernel = np.ones((1, 5), np.uint8) + erosion = cv2.dilate(erode, kernel, iterations=1) + # 水平投影 + projection_map = np.ones_like(erosion) + project_val_array = [0 for _ in range(0, h)] + + for j in range(0, h): + for i in range(0, w): + if erosion[j, i] == 255: + project_val_array[j] += 1 + # 根据数组,获取切割点 + start_idx = 0 # 记录进入字符区的索引 + end_idx = 0 # 记录进入空白区域的索引 + in_text = False # 是否遍历到了字符区内 + box_list = [] + spilt_threshold = 0 + for i in range(len(project_val_array)): + if in_text == False and project_val_array[ + i] > spilt_threshold: # 进入字符区了 + in_text = True + start_idx = i + elif project_val_array[ + i] <= spilt_threshold and in_text == True: # 进入空白区了 + end_idx = i + in_text = False + if end_idx - start_idx <= 2: + continue + box_list.append((start_idx, end_idx + 1)) + + if in_text: + box_list.append((start_idx, h - 1)) + # 绘制投影直方图 + for j in range(0, h): + for i in range(0, project_val_array[j]): + projection_map[j, i] = 0 + split_bbox_list = [] + if len(box_list) > 1: + for i, (h_start, h_end) in enumerate(box_list): + if i == 0: + h_start = 0 + if i == len(box_list): + h_end = h + word_img = erosion[h_start:h_end + 1, :] + word_h, word_w = word_img.shape + w_split_list, w_projection_map = self.projection(word_img.T, + word_w, word_h) + w_start, w_end = w_split_list[0][0], w_split_list[-1][1] + if h_start > 0: + h_start -= 1 + h_end += 1 + word_img = box_img[h_start:h_end + 1:, w_start:w_end + 1, :] + split_bbox_list.append([w_start, h_start, w_end, h_end]) + else: + split_bbox_list.append([0, 0, w, h]) + return split_bbox_list + + def shrink_bbox(self, bbox): + left, top, right, bottom = bbox + sh_h = min(max(int((bottom - top) * 0.1), 1), self.shrink_h_max) + sh_w = min(max(int((right - left) 
* 0.1), 1), self.shrink_w_max) + left_new = left + sh_w + right_new = right - sh_w + top_new = top + sh_h + bottom_new = bottom - sh_h + if left_new >= right_new: + left_new = left + right_new = right + if top_new >= bottom_new: + top_new = top + bottom_new = bottom + return [left_new, top_new, right_new, bottom_new] + + def __call__(self, data): + img = data['image'] + cells = data['cells'] + height, width = img.shape[0:2] + if self.mask_type == 1: + mask_img = np.zeros((height, width), dtype=np.float32) + else: + mask_img = np.zeros((height, width, 3), dtype=np.float32) + cell_num = len(cells) + for cno in range(cell_num): + if "bbox" in cells[cno]: + bbox = cells[cno]['bbox'] + left, top, right, bottom = bbox + box_img = img[top:bottom, left:right, :].copy() + split_bbox_list = self.projection_cx(box_img) + for sno in range(len(split_bbox_list)): + split_bbox_list[sno][0] += left + split_bbox_list[sno][1] += top + split_bbox_list[sno][2] += left + split_bbox_list[sno][3] += top + + for sno in range(len(split_bbox_list)): + left, top, right, bottom = split_bbox_list[sno] + left, top, right, bottom = self.shrink_bbox( + [left, top, right, bottom]) + if self.mask_type == 1: + mask_img[top:bottom, left:right] = 1.0 + data['mask_img'] = mask_img + else: + mask_img[top:bottom, left:right, :] = (255, 255, 255) + data['image'] = mask_img + return data + + +class ResizeTableImage(object): + def __init__(self, max_len, resize_bboxes=False, infer_mode=False, + **kwargs): + super(ResizeTableImage, self).__init__() + self.max_len = max_len + self.resize_bboxes = resize_bboxes + self.infer_mode = infer_mode + + def __call__(self, data): + img = data['image'] + height, width = img.shape[0:2] + ratio = self.max_len / (max(height, width) * 1.0) + resize_h = int(height * ratio) + resize_w = int(width * ratio) + resize_img = cv2.resize(img, (resize_w, resize_h)) + if self.resize_bboxes and not self.infer_mode: + data['bboxes'] = data['bboxes'] * ratio + data['image'] = resize_img + data['src_img'] = img + data['shape'] = np.array([height, width, ratio, ratio]) + data['max_len'] = self.max_len + return data + + +class PaddingTableImage(object): + def __init__(self, size, **kwargs): + super(PaddingTableImage, self).__init__() + self.size = size + + def __call__(self, data): + img = data['image'] + pad_h, pad_w = self.size + padding_img = np.zeros((pad_h, pad_w, 3), dtype=np.float32) + height, width = img.shape[0:2] + padding_img[0:height, 0:width, :] = img.copy() + data['image'] = padding_img + shape = data['shape'].tolist() + shape.extend([pad_h, pad_w]) + data['shape'] = np.array(shape) + return data diff --git a/ppocr/data/imaug/text_image_aug/__init__.py b/ppocr/data/imaug/text_image_aug/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bca262638efb00190aaf2b6328cd34a9e87ba131 --- /dev/null +++ b/ppocr/data/imaug/text_image_aug/__init__.py @@ -0,0 +1,17 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
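
This package re-exports the three TIA warps that RecAug chains earlier in this diff: distort, then stretch, then perspective, applied only when the crop is at least 20 px on each side. A minimal standalone sketch of that chain (the absolute import path and the dummy crop are assumptions):

```python
import random
import numpy as np
from ppocr.data.imaug.text_image_aug import tia_distort, tia_stretch, tia_perspective

img = np.random.randint(0, 256, (48, 160, 3), dtype=np.uint8)   # dummy text crop
h, w = img.shape[:2]
if h >= 20 and w >= 20:                              # same guard RecAug uses
    img = tia_distort(img, random.randint(3, 6))     # jitter all control points
    img = tia_stretch(img, random.randint(3, 6))     # shift interior columns sideways
    img = tia_perspective(img)                       # move top/bottom edges vertically
print(img.shape)   # size should stay (48, 160, 3): WarpMLS is built with the original w, h
```
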
+ +from .augment import tia_perspective, tia_distort, tia_stretch + +__all__ = ['tia_distort', 'tia_stretch', 'tia_perspective'] diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-37.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63fdf22d721ed555bf836b39ee17cf886c688d9b Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-37.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-38.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..302f5639b1f3b7aa8dddbf53496d21667f80a494 Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-37.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca727fb99b55e672675da01f0af94b11c7dc163b Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-37.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-38.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6f98436f9860d02ac0bf073510b4fa118559319 Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/augment.cpython-38.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-37.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0abd55f9e6e7cad89f951d400579c21fc5695437 Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-37.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-38.pyc b/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a8ddf4d52af8d8aa7be895409f3898ab4629a1a Binary files /dev/null and b/ppocr/data/imaug/text_image_aug/__pycache__/warp_mls.cpython-38.pyc differ diff --git a/ppocr/data/imaug/text_image_aug/augment.py b/ppocr/data/imaug/text_image_aug/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..2d15dd5f353c72a6cc3876481c423d81a8175c95 --- /dev/null +++ b/ppocr/data/imaug/text_image_aug/augment.py @@ -0,0 +1,120 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
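The tia_* functions exported here (and defined in augment.py just below) synthesize distorted recognition samples by jittering a grid of control points and warping the crop with the moving-least-squares transform in warp_mls.py. A hedged usage sketch follows; the image path, size guard and probabilities are illustrative and not taken from any training config.

import cv2
import numpy as np
from ppocr.data.imaug.text_image_aug import tia_distort, tia_stretch, tia_perspective

img = cv2.imread("word_crop.jpg")                      # any BGR text-line crop (made-up path)
if img is not None and min(img.shape[:2]) >= 20:       # the warps expect a reasonably sized crop
    if np.random.rand() < 0.4:
        img = tia_distort(img, segment=4)              # jitter corner and segment control points
    if np.random.rand() < 0.4:
        img = tia_stretch(img, segment=4)              # shift interior control points horizontally
    if np.random.rand() < 0.4:
        img = tia_perspective(img)                     # random top/bottom perspective squeeze
    cv2.imwrite("word_crop_aug.jpg", img)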
+""" +This code is refer from: +https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/augment.py +""" + +import numpy as np +from .warp_mls import WarpMLS + + +def tia_distort(src, segment=4): + img_h, img_w = src.shape[:2] + + cut = img_w // segment + thresh = cut // 3 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([np.random.randint(thresh), np.random.randint(thresh)]) + dst_pts.append( + [img_w - np.random.randint(thresh), np.random.randint(thresh)]) + dst_pts.append( + [img_w - np.random.randint(thresh), img_h - np.random.randint(thresh)]) + dst_pts.append( + [np.random.randint(thresh), img_h - np.random.randint(thresh)]) + + half_thresh = thresh * 0.5 + + for cut_idx in np.arange(1, segment, 1): + src_pts.append([cut * cut_idx, 0]) + src_pts.append([cut * cut_idx, img_h]) + dst_pts.append([ + cut * cut_idx + np.random.randint(thresh) - half_thresh, + np.random.randint(thresh) - half_thresh + ]) + dst_pts.append([ + cut * cut_idx + np.random.randint(thresh) - half_thresh, + img_h + np.random.randint(thresh) - half_thresh + ]) + + trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst + + +def tia_stretch(src, segment=4): + img_h, img_w = src.shape[:2] + + cut = img_w // segment + thresh = cut * 4 // 5 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([0, 0]) + dst_pts.append([img_w, 0]) + dst_pts.append([img_w, img_h]) + dst_pts.append([0, img_h]) + + half_thresh = thresh * 0.5 + + for cut_idx in np.arange(1, segment, 1): + move = np.random.randint(thresh) - half_thresh + src_pts.append([cut * cut_idx, 0]) + src_pts.append([cut * cut_idx, img_h]) + dst_pts.append([cut * cut_idx + move, 0]) + dst_pts.append([cut * cut_idx + move, img_h]) + + trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst + + +def tia_perspective(src): + img_h, img_w = src.shape[:2] + + thresh = img_h // 2 + + src_pts = list() + dst_pts = list() + + src_pts.append([0, 0]) + src_pts.append([img_w, 0]) + src_pts.append([img_w, img_h]) + src_pts.append([0, img_h]) + + dst_pts.append([0, np.random.randint(thresh)]) + dst_pts.append([img_w, np.random.randint(thresh)]) + dst_pts.append([img_w, img_h - np.random.randint(thresh)]) + dst_pts.append([0, img_h - np.random.randint(thresh)]) + + trans = WarpMLS(src, src_pts, dst_pts, img_w, img_h) + dst = trans.generate() + + return dst \ No newline at end of file diff --git a/ppocr/data/imaug/text_image_aug/warp_mls.py b/ppocr/data/imaug/text_image_aug/warp_mls.py new file mode 100644 index 0000000000000000000000000000000000000000..75de11115cf9ba824a7cd62b8b880ea7f99e4cb2 --- /dev/null +++ b/ppocr/data/imaug/text_image_aug/warp_mls.py @@ -0,0 +1,168 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/RubanSeven/Text-Image-Augmentation-python/blob/master/warp_mls.py +""" + +import numpy as np + + +class WarpMLS: + def __init__(self, src, src_pts, dst_pts, dst_w, dst_h, trans_ratio=1.): + self.src = src + self.src_pts = src_pts + self.dst_pts = dst_pts + self.pt_count = len(self.dst_pts) + self.dst_w = dst_w + self.dst_h = dst_h + self.trans_ratio = trans_ratio + self.grid_size = 100 + self.rdx = np.zeros((self.dst_h, self.dst_w)) + self.rdy = np.zeros((self.dst_h, self.dst_w)) + + @staticmethod + def __bilinear_interp(x, y, v11, v12, v21, v22): + return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * + (1 - y) + v22 * y) * x + + def generate(self): + self.calc_delta() + return self.gen_img() + + def calc_delta(self): + w = np.zeros(self.pt_count, dtype=np.float32) + + if self.pt_count < 2: + return + + i = 0 + while 1: + if self.dst_w <= i < self.dst_w + self.grid_size - 1: + i = self.dst_w - 1 + elif i >= self.dst_w: + break + + j = 0 + while 1: + if self.dst_h <= j < self.dst_h + self.grid_size - 1: + j = self.dst_h - 1 + elif j >= self.dst_h: + break + + sw = 0 + swp = np.zeros(2, dtype=np.float32) + swq = np.zeros(2, dtype=np.float32) + new_pt = np.zeros(2, dtype=np.float32) + cur_pt = np.array([i, j], dtype=np.float32) + + k = 0 + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + break + + w[k] = 1. / ( + (i - self.dst_pts[k][0]) * (i - self.dst_pts[k][0]) + + (j - self.dst_pts[k][1]) * (j - self.dst_pts[k][1])) + + sw += w[k] + swp = swp + w[k] * np.array(self.dst_pts[k]) + swq = swq + w[k] * np.array(self.src_pts[k]) + + if k == self.pt_count - 1: + pstar = 1 / sw * swp + qstar = 1 / sw * swq + + miu_s = 0 + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + continue + pt_i = self.dst_pts[k] - pstar + miu_s += w[k] * np.sum(pt_i * pt_i) + + cur_pt -= pstar + cur_pt_j = np.array([-cur_pt[1], cur_pt[0]]) + + for k in range(self.pt_count): + if i == self.dst_pts[k][0] and j == self.dst_pts[k][1]: + continue + + pt_i = self.dst_pts[k] - pstar + pt_j = np.array([-pt_i[1], pt_i[0]]) + + tmp_pt = np.zeros(2, dtype=np.float32) + tmp_pt[0] = np.sum(pt_i * cur_pt) * self.src_pts[k][0] - \ + np.sum(pt_j * cur_pt) * self.src_pts[k][1] + tmp_pt[1] = -np.sum(pt_i * cur_pt_j) * self.src_pts[k][0] + \ + np.sum(pt_j * cur_pt_j) * self.src_pts[k][1] + tmp_pt *= (w[k] / miu_s) + new_pt += tmp_pt + + new_pt += qstar + else: + new_pt = self.src_pts[k] + + self.rdx[j, i] = new_pt[0] - i + self.rdy[j, i] = new_pt[1] - j + + j += self.grid_size + i += self.grid_size + + def gen_img(self): + src_h, src_w = self.src.shape[:2] + dst = np.zeros_like(self.src, dtype=np.float32) + + for i in np.arange(0, self.dst_h, self.grid_size): + for j in np.arange(0, self.dst_w, self.grid_size): + ni = i + self.grid_size + nj = j + self.grid_size + w = h = self.grid_size + if ni >= self.dst_h: + ni = self.dst_h - 1 + h = ni - i + 1 + if nj >= self.dst_w: + nj = self.dst_w - 1 + w = nj - j + 1 + + di = np.reshape(np.arange(h), (-1, 1)) + dj = np.reshape(np.arange(w), (1, -1)) + delta_x = self.__bilinear_interp( + di / h, dj / w, self.rdx[i, j], self.rdx[i, nj], + self.rdx[ni, j], self.rdx[ni, nj]) + delta_y = self.__bilinear_interp( + di / h, dj / w, self.rdy[i, j], self.rdy[i, nj], + self.rdy[ni, j], self.rdy[ni, nj]) + nx = j + dj + delta_x * self.trans_ratio + ny = i + di + delta_y * 
self.trans_ratio + nx = np.clip(nx, 0, src_w - 1) + ny = np.clip(ny, 0, src_h - 1) + nxi = np.array(np.floor(nx), dtype=np.int32) + nyi = np.array(np.floor(ny), dtype=np.int32) + nxi1 = np.array(np.ceil(nx), dtype=np.int32) + nyi1 = np.array(np.ceil(ny), dtype=np.int32) + + if len(self.src.shape) == 3: + x = np.tile(np.expand_dims(ny - nyi, axis=-1), (1, 1, 3)) + y = np.tile(np.expand_dims(nx - nxi, axis=-1), (1, 1, 3)) + else: + x = ny - nyi + y = nx - nxi + dst[i:i + h, j:j + w] = self.__bilinear_interp( + x, y, self.src[nyi, nxi], self.src[nyi, nxi1], + self.src[nyi1, nxi], self.src[nyi1, nxi1]) + + dst = np.clip(dst, 0, 255) + dst = np.array(dst, dtype=np.uint8) + + return dst diff --git a/ppocr/data/imaug/vqa/__init__.py b/ppocr/data/imaug/vqa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73f7dcdf712f6db0ff4354b1b01134d1277ff078 --- /dev/null +++ b/ppocr/data/imaug/vqa/__init__.py @@ -0,0 +1,20 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .token import VQATokenPad, VQASerTokenChunk, VQAReTokenChunk, VQAReTokenRelation, TensorizeEntitiesRelations + +__all__ = [ + 'VQATokenPad', 'VQASerTokenChunk', 'VQAReTokenChunk', 'VQAReTokenRelation', + 'TensorizeEntitiesRelations' +] diff --git a/ppocr/data/imaug/vqa/augment.py b/ppocr/data/imaug/vqa/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..b95fcdf0f0baea481de59321a22dab283d99e693 --- /dev/null +++ b/ppocr/data/imaug/vqa/augment.py @@ -0,0 +1,33 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import numpy as np +import random +from copy import deepcopy + + +def order_by_tbyx(ocr_info): + res = sorted(ocr_info, key=lambda r: (r["bbox"][1], r["bbox"][0])) + for i in range(len(res) - 1): + for j in range(i, 0, -1): + if abs(res[j + 1]["bbox"][1] - res[j]["bbox"][1]) < 20 and \ + (res[j + 1]["bbox"][0] < res[j]["bbox"][0]): + tmp = deepcopy(res[j]) + res[j] = deepcopy(res[j + 1]) + res[j + 1] = deepcopy(tmp) + else: + break + return res diff --git a/ppocr/data/imaug/vqa/token/__init__.py b/ppocr/data/imaug/vqa/token/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5fbaa43db9e7182cfa3efa4f3dc0d9e54c17c822 --- /dev/null +++ b/ppocr/data/imaug/vqa/token/__init__.py @@ -0,0 +1,18 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .vqa_token_chunk import VQASerTokenChunk, VQAReTokenChunk +from .vqa_token_pad import VQATokenPad +from .vqa_token_relation import VQAReTokenRelation +from .vqa_re_convert import TensorizeEntitiesRelations \ No newline at end of file diff --git a/ppocr/data/imaug/vqa/token/vqa_re_convert.py b/ppocr/data/imaug/vqa/token/vqa_re_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..86962f2590b57f38640d76ef5d8b74ead5e854e0 --- /dev/null +++ b/ppocr/data/imaug/vqa/token/vqa_re_convert.py @@ -0,0 +1,51 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class TensorizeEntitiesRelations(object): + def __init__(self, max_seq_len=512, infer_mode=False, **kwargs): + self.max_seq_len = max_seq_len + self.infer_mode = infer_mode + + def __call__(self, data): + entities = data['entities'] + relations = data['relations'] + + entities_new = np.full( + shape=[self.max_seq_len + 1, 3], fill_value=-1, dtype='int64') + entities_new[0, 0] = len(entities['start']) + entities_new[0, 1] = len(entities['end']) + entities_new[0, 2] = len(entities['label']) + entities_new[1:len(entities['start']) + 1, 0] = np.array(entities[ + 'start']) + entities_new[1:len(entities['end']) + 1, 1] = np.array(entities['end']) + entities_new[1:len(entities['label']) + 1, 2] = np.array(entities[ + 'label']) + + relations_new = np.full( + shape=[self.max_seq_len * self.max_seq_len + 1, 2], + fill_value=-1, + dtype='int64') + relations_new[0, 0] = len(relations['head']) + relations_new[0, 1] = len(relations['tail']) + relations_new[1:len(relations['head']) + 1, 0] = np.array(relations[ + 'head']) + relations_new[1:len(relations['tail']) + 1, 1] = np.array(relations[ + 'tail']) + + data['entities'] = entities_new + data['relations'] = relations_new + return data diff --git a/ppocr/data/imaug/vqa/token/vqa_token_chunk.py b/ppocr/data/imaug/vqa/token/vqa_token_chunk.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa949e688289b320c6a7c121c944708febe2c9d --- /dev/null +++ b/ppocr/data/imaug/vqa/token/vqa_token_chunk.py @@ -0,0 +1,122 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + + +class VQASerTokenChunk(object): + def __init__(self, max_seq_len=512, infer_mode=False, **kwargs): + self.max_seq_len = max_seq_len + self.infer_mode = infer_mode + + def __call__(self, data): + encoded_inputs_all = [] + seq_len = len(data['input_ids']) + for index in range(0, seq_len, self.max_seq_len): + chunk_beg = index + chunk_end = min(index + self.max_seq_len, seq_len) + encoded_inputs_example = {} + for key in data: + if key in [ + 'label', 'input_ids', 'labels', 'token_type_ids', + 'bbox', 'attention_mask' + ]: + if self.infer_mode and key == 'labels': + encoded_inputs_example[key] = data[key] + else: + encoded_inputs_example[key] = data[key][chunk_beg: + chunk_end] + else: + encoded_inputs_example[key] = data[key] + + encoded_inputs_all.append(encoded_inputs_example) + if len(encoded_inputs_all) == 0: + return None + return encoded_inputs_all[0] + + +class VQAReTokenChunk(object): + def __init__(self, + max_seq_len=512, + entities_labels=None, + infer_mode=False, + **kwargs): + self.max_seq_len = max_seq_len + self.entities_labels = { + 'HEADER': 0, + 'QUESTION': 1, + 'ANSWER': 2 + } if entities_labels is None else entities_labels + self.infer_mode = infer_mode + + def __call__(self, data): + # prepare data + entities = data.pop('entities') + relations = data.pop('relations') + encoded_inputs_all = [] + for index in range(0, len(data["input_ids"]), self.max_seq_len): + item = {} + for key in data: + if key in [ + 'label', 'input_ids', 'labels', 'token_type_ids', + 'bbox', 'attention_mask' + ]: + if self.infer_mode and key == 'labels': + item[key] = data[key] + else: + item[key] = data[key][index:index + self.max_seq_len] + else: + item[key] = data[key] + # select entity in current chunk + entities_in_this_span = [] + global_to_local_map = {} # + for entity_id, entity in enumerate(entities): + if (index <= entity["start"] < index + self.max_seq_len and + index <= entity["end"] < index + self.max_seq_len): + entity["start"] = entity["start"] - index + entity["end"] = entity["end"] - index + global_to_local_map[entity_id] = len(entities_in_this_span) + entities_in_this_span.append(entity) + + # select relations in current chunk + relations_in_this_span = [] + for relation in relations: + if (index <= relation["start_index"] < index + self.max_seq_len + and index <= relation["end_index"] < + index + self.max_seq_len): + relations_in_this_span.append({ + "head": global_to_local_map[relation["head"]], + "tail": global_to_local_map[relation["tail"]], + "start_index": relation["start_index"] - index, + "end_index": relation["end_index"] - index, + }) + item.update({ + "entities": self.reformat(entities_in_this_span), + "relations": self.reformat(relations_in_this_span), + }) + if len(item['entities']) > 0: + item['entities']['label'] = [ + self.entities_labels[x] for x in item['entities']['label'] + ] + encoded_inputs_all.append(item) + if len(encoded_inputs_all) == 0: + return None + return encoded_inputs_all[0] + + def reformat(self, data): + new_data = defaultdict(list) + for item in data: + for k, v in item.items(): + 
new_data[k].append(v) + return new_data diff --git a/ppocr/data/imaug/vqa/token/vqa_token_pad.py b/ppocr/data/imaug/vqa/token/vqa_token_pad.py new file mode 100644 index 0000000000000000000000000000000000000000..8e5a20f95f0159e5c57072dd86eff0f25cf49eac --- /dev/null +++ b/ppocr/data/imaug/vqa/token/vqa_token_pad.py @@ -0,0 +1,104 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import numpy as np + + +class VQATokenPad(object): + def __init__(self, + max_seq_len=512, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_token_type_ids=True, + truncation_strategy="longest_first", + return_overflowing_tokens=False, + return_special_tokens_mask=False, + infer_mode=False, + **kwargs): + self.max_seq_len = max_seq_len + self.pad_to_max_seq_len = max_seq_len + self.return_attention_mask = return_attention_mask + self.return_token_type_ids = return_token_type_ids + self.truncation_strategy = truncation_strategy + self.return_overflowing_tokens = return_overflowing_tokens + self.return_special_tokens_mask = return_special_tokens_mask + self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index + self.infer_mode = infer_mode + + def __call__(self, data): + needs_to_be_padded = self.pad_to_max_seq_len and len(data[ + "input_ids"]) < self.max_seq_len + + if needs_to_be_padded: + if 'tokenizer_params' in data: + tokenizer_params = data.pop('tokenizer_params') + else: + tokenizer_params = dict( + padding_side='right', pad_token_type_id=0, pad_token_id=1) + + difference = self.max_seq_len - len(data["input_ids"]) + if tokenizer_params['padding_side'] == 'right': + if self.return_attention_mask: + data["attention_mask"] = [1] * len(data[ + "input_ids"]) + [0] * difference + if self.return_token_type_ids: + data["token_type_ids"] = ( + data["token_type_ids"] + + [tokenizer_params['pad_token_type_id']] * difference) + if self.return_special_tokens_mask: + data["special_tokens_mask"] = data[ + "special_tokens_mask"] + [1] * difference + data["input_ids"] = data["input_ids"] + [ + tokenizer_params['pad_token_id'] + ] * difference + if not self.infer_mode: + data["labels"] = data[ + "labels"] + [self.pad_token_label_id] * difference + data["bbox"] = data["bbox"] + [[0, 0, 0, 0]] * difference + elif tokenizer_params['padding_side'] == 'left': + if self.return_attention_mask: + data["attention_mask"] = [0] * difference + [ + 1 + ] * len(data["input_ids"]) + if self.return_token_type_ids: + data["token_type_ids"] = ( + [tokenizer_params['pad_token_type_id']] * difference + + data["token_type_ids"]) + if self.return_special_tokens_mask: + data["special_tokens_mask"] = [ + 1 + ] * difference + data["special_tokens_mask"] + data["input_ids"] = [tokenizer_params['pad_token_id'] + ] * difference + data["input_ids"] + if not self.infer_mode: + data["labels"] = [self.pad_token_label_id + ] * difference + data["labels"] + data["bbox"] = [[0, 0, 0, 0]] * difference + data["bbox"] + else: + if self.return_attention_mask: + 
data["attention_mask"] = [1] * len(data["input_ids"]) + + for key in data: + if key in [ + 'input_ids', 'labels', 'token_type_ids', 'bbox', + 'attention_mask' + ]: + if self.infer_mode: + if key != 'labels': + length = min(len(data[key]), self.max_seq_len) + data[key] = data[key][:length] + else: + continue + data[key] = np.array(data[key], dtype='int64') + return data diff --git a/ppocr/data/imaug/vqa/token/vqa_token_relation.py b/ppocr/data/imaug/vqa/token/vqa_token_relation.py new file mode 100644 index 0000000000000000000000000000000000000000..293988ff85aecb39bac84b412f3466abecc6db4d --- /dev/null +++ b/ppocr/data/imaug/vqa/token/vqa_token_relation.py @@ -0,0 +1,67 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class VQAReTokenRelation(object): + def __init__(self, **kwargs): + pass + + def __call__(self, data): + """ + build relations + """ + entities = data['entities'] + relations = data['relations'] + id2label = data.pop('id2label') + empty_entity = data.pop('empty_entity') + entity_id_to_index_map = data.pop('entity_id_to_index_map') + + relations = list(set(relations)) + relations = [ + rel for rel in relations + if rel[0] not in empty_entity and rel[1] not in empty_entity + ] + kv_relations = [] + for rel in relations: + pair = [id2label[rel[0]], id2label[rel[1]]] + if pair == ["question", "answer"]: + kv_relations.append({ + "head": entity_id_to_index_map[rel[0]], + "tail": entity_id_to_index_map[rel[1]] + }) + elif pair == ["answer", "question"]: + kv_relations.append({ + "head": entity_id_to_index_map[rel[1]], + "tail": entity_id_to_index_map[rel[0]] + }) + else: + continue + relations = sorted( + [{ + "head": rel["head"], + "tail": rel["tail"], + "start_index": self.get_relation_span(rel, entities)[0], + "end_index": self.get_relation_span(rel, entities)[1], + } for rel in kv_relations], + key=lambda x: x["head"], ) + + data['relations'] = relations + return data + + def get_relation_span(self, rel, entities): + bound = [] + for entity_index in [rel["head"], rel["tail"]]: + bound.append(entities[entity_index]["start"]) + bound.append(entities[entity_index]["end"]) + return min(bound), max(bound) diff --git a/ppocr/data/lmdb_dataset.py b/ppocr/data/lmdb_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..295643e401481d30cf433346727f39d4a4c7d2f4 --- /dev/null +++ b/ppocr/data/lmdb_dataset.py @@ -0,0 +1,205 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +from paddle.io import Dataset +import lmdb +import cv2 +import string +import six +from PIL import Image + +from .imaug import transform, create_operators + + +class LMDBDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(LMDBDataSet, self).__init__() + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + batch_size = loader_config['batch_size_per_card'] + data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + + self.lmdb_sets = self.load_hierarchical_lmdb_dataset(data_dir) + logger.info("Initialize indexs of datasets:%s" % data_dir) + self.data_idx_order_list = self.dataset_traversal() + if self.do_shuffle: + np.random.shuffle(self.data_idx_order_list) + self.ops = create_operators(dataset_config['transforms'], global_config) + self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", + 1) + + ratio_list = dataset_config.get("ratio_list", [1.0]) + self.need_reset = True in [x < 1 for x in ratio_list] + + def load_hierarchical_lmdb_dataset(self, data_dir): + lmdb_sets = {} + dataset_idx = 0 + for dirpath, dirnames, filenames in os.walk(data_dir + '/'): + if not dirnames: + env = lmdb.open( + dirpath, + max_readers=32, + readonly=True, + lock=False, + readahead=False, + meminit=False) + txn = env.begin(write=False) + num_samples = int(txn.get('num-samples'.encode())) + lmdb_sets[dataset_idx] = {"dirpath":dirpath, "env":env, \ + "txn":txn, "num_samples":num_samples} + dataset_idx += 1 + return lmdb_sets + + def dataset_traversal(self): + lmdb_num = len(self.lmdb_sets) + total_sample_num = 0 + for lno in range(lmdb_num): + total_sample_num += self.lmdb_sets[lno]['num_samples'] + data_idx_order_list = np.zeros((total_sample_num, 2)) + beg_idx = 0 + for lno in range(lmdb_num): + tmp_sample_num = self.lmdb_sets[lno]['num_samples'] + end_idx = beg_idx + tmp_sample_num + data_idx_order_list[beg_idx:end_idx, 0] = lno + data_idx_order_list[beg_idx:end_idx, 1] \ + = list(range(tmp_sample_num)) + data_idx_order_list[beg_idx:end_idx, 1] += 1 + beg_idx = beg_idx + tmp_sample_num + return data_idx_order_list + + def get_img_data(self, value): + """get_img_data""" + if not value: + return None + imgdata = np.frombuffer(value, dtype='uint8') + if imgdata is None: + return None + imgori = cv2.imdecode(imgdata, 1) + if imgori is None: + return None + return imgori + + def get_ext_data(self): + ext_data_num = 0 + for op in self.ops: + if hasattr(op, 'ext_data_num'): + ext_data_num = getattr(op, 'ext_data_num') + break + load_data_ops = self.ops[:self.ext_op_transform_idx] + ext_data = [] + + while len(ext_data) < ext_data_num: + lmdb_idx, file_idx = self.data_idx_order_list[np.random.randint( + len(self))] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info( + self.lmdb_sets[lmdb_idx]['txn'], file_idx) + if sample_info is None: + continue + img, label = sample_info + data = {'image': img, 'label': label} + data = transform(data, load_data_ops) + if data is None: + continue + ext_data.append(data) + return ext_data + + def get_lmdb_sample_info(self, txn, index): + label_key = 'label-%09d'.encode() % index + label = txn.get(label_key) + if label is None: + return None + label = label.decode('utf-8') + img_key = 'image-%09d'.encode() % index + imgbuf = txn.get(img_key) + return imgbuf, label + + 
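# Illustrative aside (not part of LMDBDataSet): get_lmdb_sample_info above expects
# one 'num-samples' entry plus paired 'image-%09d' / 'label-%09d' keys indexed from 1,
# which is why dataset_traversal() adds 1 to every sample index. A minimal writer
# that produces a compatible store might look like this; paths and labels are made up.
import lmdb

env = lmdb.open("./rec_train_lmdb", map_size=1 << 30)
samples = [("word_1.jpg", "hello"), ("word_2.jpg", "world")]
with env.begin(write=True) as txn:
    for i, (img_path, label) in enumerate(samples, start=1):
        with open(img_path, "rb") as f:
            txn.put(("image-%09d" % i).encode(), f.read())    # raw encoded image bytes
        txn.put(("label-%09d" % i).encode(), label.encode("utf-8"))
    txn.put("num-samples".encode(), str(len(samples)).encode())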
def __getitem__(self, idx): + lmdb_idx, file_idx = self.data_idx_order_list[idx] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'], + file_idx) + if sample_info is None: + return self.__getitem__(np.random.randint(self.__len__())) + img, label = sample_info + data = {'image': img, 'label': label} + data['ext_data'] = self.get_ext_data() + outs = transform(data, self.ops) + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return self.data_idx_order_list.shape[0] + + +class LMDBDataSetSR(LMDBDataSet): + def buf2PIL(self, txn, key, type='RGB'): + imgbuf = txn.get(key) + buf = six.BytesIO() + buf.write(imgbuf) + buf.seek(0) + im = Image.open(buf).convert(type) + return im + + def str_filt(self, str_, voc_type): + alpha_dict = { + 'digit': string.digits, + 'lower': string.digits + string.ascii_lowercase, + 'upper': string.digits + string.ascii_letters, + 'all': string.digits + string.ascii_letters + string.punctuation + } + if voc_type == 'lower': + str_ = str_.lower() + for char in str_: + if char not in alpha_dict[voc_type]: + str_ = str_.replace(char, '') + return str_ + + def get_lmdb_sample_info(self, txn, index): + self.voc_type = 'upper' + self.max_len = 100 + self.test = False + label_key = b'label-%09d' % index + word = str(txn.get(label_key).decode()) + img_HR_key = b'image_hr-%09d' % index # 128*32 + img_lr_key = b'image_lr-%09d' % index # 64*16 + try: + img_HR = self.buf2PIL(txn, img_HR_key, 'RGB') + img_lr = self.buf2PIL(txn, img_lr_key, 'RGB') + except IOError or len(word) > self.max_len: + return self[index + 1] + label_str = self.str_filt(word, self.voc_type) + return img_HR, img_lr, label_str + + def __getitem__(self, idx): + lmdb_idx, file_idx = self.data_idx_order_list[idx] + lmdb_idx = int(lmdb_idx) + file_idx = int(file_idx) + sample_info = self.get_lmdb_sample_info(self.lmdb_sets[lmdb_idx]['txn'], + file_idx) + if sample_info is None: + return self.__getitem__(np.random.randint(self.__len__())) + img_HR, img_lr, label_str = sample_info + data = {'image_hr': img_HR, 'image_lr': img_lr, 'label': label_str} + outs = transform(data, self.ops) + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs diff --git a/ppocr/data/pgnet_dataset.py b/ppocr/data/pgnet_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6f80179c4eb971ace360edb5368f6a2acd5a6322 --- /dev/null +++ b/ppocr/data/pgnet_dataset.py @@ -0,0 +1,106 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
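LMDBDataSetSR above reads text super-resolution pairs rather than single crops: each record stores a high- and a low-resolution image under 'image_hr-%09d' / 'image_lr-%09d' plus a 'label-%09d' string (the comments in the class suggest 128x32 and 64x16 crops), and str_filt drops characters outside the chosen vocabulary. A minimal stand-alone reader under those assumptions; the LMDB path is made up.

import io
import lmdb
from PIL import Image

env = lmdb.open("./sr_train_lmdb", readonly=True, lock=False)  # hypothetical store
with env.begin(write=False) as txn:
    idx = 1                                                    # records are 1-indexed, as above
    label = txn.get(b"label-%09d" % idx).decode("utf-8")
    hr = Image.open(io.BytesIO(txn.get(b"image_hr-%09d" % idx))).convert("RGB")
    lr = Image.open(io.BytesIO(txn.get(b"image_lr-%09d" % idx))).convert("RGB")
    print(label, hr.size, lr.size)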
+import numpy as np +import os +from paddle.io import Dataset +from .imaug import transform, create_operators +import random + + +class PGDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(PGDataSet, self).__init__() + + self.logger = logger + self.seed = seed + self.mode = mode + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + self.delimiter = dataset_config.get('delimiter', '\t') + label_file_list = dataset_config.pop('label_file_list') + data_source_num = len(label_file_list) + ratio_list = dataset_config.get("ratio_list", [1.0]) + if isinstance(ratio_list, (float, int)): + ratio_list = [float(ratio_list)] * int(data_source_num) + assert len( + ratio_list + ) == data_source_num, "The length of ratio_list should be the same as the file_list." + self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + + logger.info("Initialize indexs of datasets:%s" % label_file_list) + self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + self.data_idx_order_list = list(range(len(self.data_lines))) + if mode.lower() == "train": + self.shuffle_data_random() + + self.ops = create_operators(dataset_config['transforms'], global_config) + + self.need_reset = True in [x < 1 for x in ratio_list] + + def shuffle_data_random(self): + if self.do_shuffle: + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def get_image_info_list(self, file_list, ratio_list): + if isinstance(file_list, str): + file_list = [file_list] + data_lines = [] + for idx, file in enumerate(file_list): + with open(file, "rb") as f: + lines = f.readlines() + if self.mode == "train" or ratio_list[idx] < 1.0: + random.seed(self.seed) + lines = random.sample(lines, + round(len(lines) * ratio_list[idx])) + data_lines.extend(lines) + return data_lines + + def __getitem__(self, idx): + file_idx = self.data_idx_order_list[idx] + data_line = self.data_lines[file_idx] + img_id = 0 + try: + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + if self.mode.lower() == 'eval': + try: + img_id = int(data_line.split(".")[0][7:]) + except: + img_id = 0 + data = {'img_path': img_path, 'label': label, 'img_id': img_id} + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + outs = transform(data, self.ops) + except Exception as e: + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + self.data_idx_order_list[idx], e)) + outs = None + if outs is None: + return self.__getitem__(np.random.randint(self.__len__())) + return outs + + def __len__(self): + return len(self.data_idx_order_list) diff --git a/ppocr/data/pubtab_dataset.py b/ppocr/data/pubtab_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..642d3eb1961cbf0e829e6fb122f38c6af99df1c5 --- /dev/null +++ b/ppocr/data/pubtab_dataset.py @@ -0,0 +1,133 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +import random +from paddle.io import Dataset +import json +from copy import deepcopy + +from .imaug import transform, create_operators + + +class PubTabDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(PubTabDataSet, self).__init__() + self.logger = logger + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + label_file_list = dataset_config.pop('label_file_list') + data_source_num = len(label_file_list) + ratio_list = dataset_config.get("ratio_list", [1.0]) + if isinstance(ratio_list, (float, int)): + ratio_list = [float(ratio_list)] * int(data_source_num) + + assert len( + ratio_list + ) == data_source_num, "The length of ratio_list should be the same as the file_list." + + self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + + self.seed = seed + self.mode = mode.lower() + logger.info("Initialize indexs of datasets:%s" % label_file_list) + self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + # self.check(config['Global']['max_text_length']) + + if mode.lower() == "train" and self.do_shuffle: + self.shuffle_data_random() + self.ops = create_operators(dataset_config['transforms'], global_config) + self.need_reset = True in [x < 1 for x in ratio_list] + + def get_image_info_list(self, file_list, ratio_list): + if isinstance(file_list, str): + file_list = [file_list] + data_lines = [] + for idx, file in enumerate(file_list): + with open(file, "rb") as f: + lines = f.readlines() + if self.mode == "train" or ratio_list[idx] < 1.0: + random.seed(self.seed) + lines = random.sample(lines, + round(len(lines) * ratio_list[idx])) + data_lines.extend(lines) + return data_lines + + def check(self, max_text_length): + data_lines = [] + for line in self.data_lines: + data_line = line.decode('utf-8').strip("\n") + info = json.loads(data_line) + file_name = info['filename'] + cells = info['html']['cells'].copy() + structure = info['html']['structure']['tokens'].copy() + + img_path = os.path.join(self.data_dir, file_name) + if not os.path.exists(img_path): + self.logger.warning("{} does not exist!".format(img_path)) + continue + if len(structure) == 0 or len(structure) > max_text_length: + continue + # data = {'img_path': img_path, 'cells': cells, 'structure':structure,'file_name':file_name} + data_lines.append(line) + self.data_lines = data_lines + + def shuffle_data_random(self): + if self.do_shuffle: + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def __getitem__(self, idx): + try: + data_line = self.data_lines[idx] + data_line = data_line.decode('utf-8').strip("\n") + info = json.loads(data_line) + file_name = info['filename'] + cells = info['html']['cells'].copy() + structure = info['html']['structure']['tokens'].copy() + + img_path = os.path.join(self.data_dir, file_name) + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + data = { + 'img_path': img_path, + 'cells': cells, + 'structure': structure, + 'file_name': file_name + } + + with 
open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + outs = transform(data, self.ops) + except: + import traceback + err = traceback.format_exc() + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data_line, err)) + outs = None + if outs is None: + rnd_idx = np.random.randint(self.__len__( + )) if self.mode == "train" else (idx + 1) % self.__len__() + return self.__getitem__(rnd_idx) + return outs + + def __len__(self): + return len(self.data_lines) diff --git a/ppocr/data/simple_dataset.py b/ppocr/data/simple_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..402f1e38fed9e32722e2dd160f10f779028807a3 --- /dev/null +++ b/ppocr/data/simple_dataset.py @@ -0,0 +1,151 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import os +import json +import random +import traceback +from paddle.io import Dataset +from .imaug import transform, create_operators + + +class SimpleDataSet(Dataset): + def __init__(self, config, mode, logger, seed=None): + super(SimpleDataSet, self).__init__() + self.logger = logger + self.mode = mode.lower() + + global_config = config['Global'] + dataset_config = config[mode]['dataset'] + loader_config = config[mode]['loader'] + + self.delimiter = dataset_config.get('delimiter', '\t') + label_file_list = dataset_config.pop('label_file_list') + data_source_num = len(label_file_list) + ratio_list = dataset_config.get("ratio_list", 1.0) + if isinstance(ratio_list, (float, int)): + ratio_list = [float(ratio_list)] * int(data_source_num) + + assert len( + ratio_list + ) == data_source_num, "The length of ratio_list should be the same as the file_list." 
+ self.data_dir = dataset_config['data_dir'] + self.do_shuffle = loader_config['shuffle'] + self.seed = seed + logger.info("Initialize indexs of datasets:%s" % label_file_list) + self.data_lines = self.get_image_info_list(label_file_list, ratio_list) + self.data_idx_order_list = list(range(len(self.data_lines))) + if self.mode == "train" and self.do_shuffle: + self.shuffle_data_random() + self.ops = create_operators(dataset_config['transforms'], global_config) + self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", + 2) + self.need_reset = True in [x < 1 for x in ratio_list] + + def get_image_info_list(self, file_list, ratio_list): + if isinstance(file_list, str): + file_list = [file_list] + data_lines = [] + for idx, file in enumerate(file_list): + with open(file, "rb") as f: + lines = f.readlines() + if self.mode == "train" or ratio_list[idx] < 1.0: + random.seed(self.seed) + lines = random.sample(lines, + round(len(lines) * ratio_list[idx])) + data_lines.extend(lines) + return data_lines + + def shuffle_data_random(self): + random.seed(self.seed) + random.shuffle(self.data_lines) + return + + def _try_parse_filename_list(self, file_name): + # multiple images -> one gt label + if len(file_name) > 0 and file_name[0] == "[": + try: + info = json.loads(file_name) + file_name = random.choice(info) + except: + pass + return file_name + + def get_ext_data(self): + ext_data_num = 0 + for op in self.ops: + if hasattr(op, 'ext_data_num'): + ext_data_num = getattr(op, 'ext_data_num') + break + load_data_ops = self.ops[:self.ext_op_transform_idx] + ext_data = [] + + while len(ext_data) < ext_data_num: + file_idx = self.data_idx_order_list[np.random.randint(self.__len__( + ))] + data_line = self.data_lines[file_idx] + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + file_name = self._try_parse_filename_list(file_name) + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + data = {'img_path': img_path, 'label': label} + if not os.path.exists(img_path): + continue + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + data = transform(data, load_data_ops) + + if data is None: + continue + if 'polys' in data.keys(): + if data['polys'].shape[1] != 4: + continue + ext_data.append(data) + return ext_data + + def __getitem__(self, idx): + file_idx = self.data_idx_order_list[idx] + data_line = self.data_lines[file_idx] + try: + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split(self.delimiter) + file_name = substr[0] + file_name = self._try_parse_filename_list(file_name) + label = substr[1] + img_path = os.path.join(self.data_dir, file_name) + data = {'img_path': img_path, 'label': label} + if not os.path.exists(img_path): + raise Exception("{} does not exist!".format(img_path)) + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + data['ext_data'] = self.get_ext_data() + outs = transform(data, self.ops) + except: + self.logger.error( + "When parsing line {}, error happened with msg: {}".format( + data_line, traceback.format_exc())) + outs = None + if outs is None: + # during evaluation, we should fix the idx to get same results for many times of evaluation. 
+ rnd_idx = np.random.randint(self.__len__( + )) if self.mode == "train" else (idx + 1) % self.__len__() + return self.__getitem__(rnd_idx) + return outs + + def __len__(self): + return len(self.data_idx_order_list) diff --git a/ppocr/ext_op/__init__.py b/ppocr/ext_op/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0be1ada03b07afcfe951b655b710849aba8dbaa --- /dev/null +++ b/ppocr/ext_op/__init__.py @@ -0,0 +1 @@ +from .roi_align_rotated.roi_align_rotated import RoIAlignRotated diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc new file mode 100644 index 0000000000000000000000000000000000000000..2de86c53730c58bc58b0b6bd5e0098435339d4f9 --- /dev/null +++ b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc @@ -0,0 +1,528 @@ + +// This code is refer from: +// https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp + +#include +#include +#include + +#include "paddle/extension.h" + +#define PADDLE_WITH_CUDA +#define CHECK_INPUT_SAME(x1, x2) \ + PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") +#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + +template struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, const int width, const int pooled_height, + const int pooled_width, const int iy_upper, const int ix_upper, + T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w, + T cos_theta, T sin_theta, std::vector> &pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) { + y = 0; + } + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void roi_align_rotated_cpu_forward(const int nthreads, const T *input, + const T &spatial_scale, const bool aligned, + const bool clockwise, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, const T *rois, + T *output) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T *current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + assert(roi_width >= 0 && roi_height >= 0); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * + pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. 
+ T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate( + height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, + sin_theta, pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T *offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T &w1, T &w2, T &w3, T &w4, int &x_low, + int &x_high, int &y_low, int &y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template inline void add(T *address, const T &val) { + *address += val; +} + +template +void roi_align_rotated_cpu_backward( + const int nthreads, + // may not be contiguous. should index using n_stride, etc + const T *grad_output, const T &spatial_scale, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + T *grad_input, const T *rois, const int n_stride, const int c_stride, + const int h_stride, const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T *current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + assert(roi_width >= 0 && roi_height >= 0); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T *offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T *offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignRotatedBackward + +std::vector +RoIAlignRotatedCPUForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + CHECK_INPUT_CPU(input); + CHECK_INPUT_CPU(rois); + + auto num_rois = rois.shape()[0]; + + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto output = + paddle::empty({num_rois, channels, aligned_height, aligned_width}, + input.type(), paddle::CPUPlace()); + auto output_size = output.numel(); + + 
PD_DISPATCH_FLOATING_TYPES( + input.type(), "roi_align_rotated_cpu_forward", ([&] { + roi_align_rotated_cpu_forward( + output_size, input.data(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + rois.data(), output.data()); + })); + + return {output}; +} + +std::vector RoIAlignRotatedCPUBackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise) { + + auto batch_size = input.shape()[0]; + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto grad_input = paddle::full({batch_size, channels, height, width}, 0.0, + input.type(), paddle::CPUPlace()); + + // get stride values to ensure indexing into gradients is correct. + int n_stride = grad_output.shape()[0]; + int c_stride = grad_output.shape()[1]; + int h_stride = grad_output.shape()[2]; + int w_stride = grad_output.shape()[3]; + + PD_DISPATCH_FLOATING_TYPES( + grad_output.type(), "roi_align_rotated_cpu_backward", [&] { + roi_align_rotated_cpu_backward( + grad_output.numel(), grad_output.data(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + grad_input.data(), rois.data(), n_stride, c_stride, + h_stride, w_stride); + }); + return {grad_input}; +} + +#ifdef PADDLE_WITH_CUDA +std::vector +RoIAlignRotatedCUDAForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise); +#endif + +#ifdef PADDLE_WITH_CUDA +std::vector RoIAlignRotatedCUDABackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise); +#endif + +std::vector +RoIAlignRotatedForward(const paddle::Tensor &input, const paddle::Tensor &rois, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, + bool clockwise) { + CHECK_INPUT_SAME(input, rois); + if (input.is_cpu()) { + return RoIAlignRotatedCPUForward(input, rois, aligned_height, aligned_width, + spatial_scale, sampling_ratio, aligned, + clockwise); +#ifdef PADDLE_WITH_CUDA + } else if (input.is_gpu()) { + return RoIAlignRotatedCUDAForward(input, rois, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#endif + } else { + PD_THROW("Unsupported device type for forward function of roi align " + "rotated operator."); + } +} + +std::vector +RoIAlignRotatedBackward(const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + CHECK_INPUT_SAME(input, rois); + if (input.is_cpu()) { + return RoIAlignRotatedCPUBackward(input, rois, grad_output, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#ifdef PADDLE_WITH_CUDA + } else if (input.is_gpu()) { + return RoIAlignRotatedCUDABackward(input, rois, grad_output, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +#endif + } else { + PD_THROW("Unsupported device type for forward function of roi align " + "rotated operator."); + } +} + +std::vector> 
InferShape(std::vector input_shape, + std::vector rois_shape) { + return {{rois_shape[0], input_shape[1], input_shape[2], input_shape[3]}}; +} + +std::vector> +InferBackShape(std::vector input_shape, + std::vector rois_shape) { + return {input_shape}; +} + +std::vector InferDtype(paddle::DataType input_dtype, + paddle::DataType rois_dtype) { + return {input_dtype}; +} + +PD_BUILD_OP(roi_align_rotated) + .Inputs({"Input", "Rois"}) + .Outputs({"Output"}) + .Attrs({"aligned_height: int", "aligned_width: int", "spatial_scale: float", + "sampling_ratio: int", "aligned: bool", "clockwise: bool"}) + .SetKernelFn(PD_KERNEL(RoIAlignRotatedForward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)); + +PD_BUILD_GRAD_OP(roi_align_rotated) + .Inputs({"Input", "Rois", paddle::Grad("Output")}) + .Attrs({"aligned_height: int", "aligned_width: int", "spatial_scale: float", + "sampling_ratio: int", "aligned: bool", "clockwise: bool"}) + .Outputs({paddle::Grad("Input")}) + .SetKernelFn(PD_KERNEL(RoIAlignRotatedBackward)) + .SetInferShapeFn(PD_INFER_SHAPE(InferBackShape)); diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu new file mode 100644 index 0000000000000000000000000000000000000000..17bd47dc08be732bdb228da9696ee2d163179c73 --- /dev/null +++ b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu @@ -0,0 +1,380 @@ + +// This code is refer from: +// https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh + +#include +#include +#include + +#include "paddle/extension.h" +#include + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +#define THREADS_PER_BLOCK 512 + +inline int GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int max_block_num = 4096; + return min(optimal_block_num, max_block_num); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + +static __inline__ __device__ double atomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + if (val == 0.0) + return __longlong_as_double(old); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif + +template +__device__ T bilinear_interpolate(const T *input, const int height, + const int width, T y, T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) + return 0; + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void +bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T &w1, T &w2, T &w3, T &w4, int &x_low, + int &x_high, int &y_low, int &y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +/*** Forward ***/ +template +__global__ void roi_align_rotated_cuda_forward_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? 
sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void roi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int sample_num, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? 
sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +std::vector +RoIAlignRotatedCUDAForward(const paddle::Tensor &input, + const paddle::Tensor &rois, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, bool clockwise) { + + auto num_rois = rois.shape()[0]; + + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto output = + paddle::empty({num_rois, channels, aligned_height, aligned_width}, + input.type(), paddle::GPUPlace()); + auto output_size = output.numel(); + + PD_DISPATCH_FLOATING_TYPES( + input.type(), "roi_align_rotated_cuda_forward_kernel", ([&] { + roi_align_rotated_cuda_forward_kernel< + data_t><<>>( + output_size, input.data(), rois.data(), + static_cast(spatial_scale), sampling_ratio, aligned, + clockwise, channels, height, width, aligned_height, aligned_width, + output.data()); + })); + + return {output}; +} + +std::vector RoIAlignRotatedCUDABackward( + const paddle::Tensor &input, const paddle::Tensor &rois, + const paddle::Tensor &grad_output, int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, bool aligned, bool clockwise) { + + auto num_rois = rois.shape()[0]; + + auto batch_size = input.shape()[0]; + auto channels = input.shape()[1]; + auto height = input.shape()[2]; + auto width = input.shape()[3]; + + auto grad_input = paddle::full({batch_size, channels, height, width}, 0.0, + input.type(), paddle::GPUPlace()); + + const int output_size = num_rois * aligned_height * aligned_width * channels; + + PD_DISPATCH_FLOATING_TYPES( + grad_output.type(), "roi_align_rotated_backward_cuda_kernel", ([&] { + 
roi_align_rotated_backward_cuda_kernel< + data_t><<>>( + output_size, grad_output.data(), rois.data(), + spatial_scale, sampling_ratio, aligned, clockwise, channels, height, + width, aligned_height, aligned_width, grad_input.data()); + })); + return {grad_input}; +} \ No newline at end of file diff --git a/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..dcca285c75f9c68ff15409810edcec887eed2026 --- /dev/null +++ b/ppocr/ext_op/roi_align_rotated/roi_align_rotated.py @@ -0,0 +1,66 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/roi_align_rotated.py +""" + +import paddle +import paddle.nn as nn +from paddle.utils.cpp_extension import load +custom_ops = load( + name="custom_jit_ops", + sources=[ + "ppocr/ext_op/roi_align_rotated/roi_align_rotated.cc", + "ppocr/ext_op/roi_align_rotated/roi_align_rotated.cu" + ]) + +roi_align_rotated = custom_ops.roi_align_rotated + + +class RoIAlignRotated(nn.Layer): + """RoI align pooling layer for rotated proposals. + + """ + + def __init__(self, + out_size, + spatial_scale, + sample_num=0, + aligned=True, + clockwise=False): + super(RoIAlignRotated, self).__init__() + + if isinstance(out_size, int): + self.out_h = out_size + self.out_w = out_size + elif isinstance(out_size, tuple): + assert len(out_size) == 2 + assert isinstance(out_size[0], int) + assert isinstance(out_size[1], int) + self.out_h, self.out_w = out_size + else: + raise TypeError( + '"out_size" must be an integer or tuple of integers') + + self.spatial_scale = float(spatial_scale) + self.sample_num = int(sample_num) + self.aligned = aligned + self.clockwise = clockwise + + def forward(self, feats, rois): + output = roi_align_rotated(feats, rois, self.out_h, self.out_w, + self.spatial_scale, self.sample_num, + self.aligned, self.clockwise) + return output diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7142e3e5e73e25764dde4631a47be939905e3be --- /dev/null +++ b/ppocr/losses/__init__.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
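For reference, a minimal sketch of calling the RoIAlignRotated wrapper defined in roi_align_rotated.py above. Only the constructor arguments and the six-value RoI layout (batch_index, center_x, center_y, width, height, angle in radians) follow the code; the shapes, values, and import path are illustrative assumptions, and the script would need to run from the repository root so that load(...) can JIT-compile the custom op.

import paddle
from ppocr.ext_op.roi_align_rotated.roi_align_rotated import RoIAlignRotated  # assumed package path

feats = paddle.randn([1, 256, 32, 32])           # N, C, H, W (illustrative)
rois = paddle.to_tensor([                        # [batch_idx, cx, cy, w, h, theta]
    [0., 100.0, 80.0, 60.0, 30.0, 0.3],
    [0., 120.0, 100.0, 90.0, 40.0, -0.5],
], dtype='float32')

roi_align = RoIAlignRotated(out_size=7, spatial_scale=0.25, sample_num=2)
pooled = roi_align(feats, rois)                  # expected shape: [2, 256, 7, 7]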
+ +import copy +import paddle +import paddle.nn as nn + +# basic_loss +from .basic_loss import LossFromOutput + +# det loss +from .det_db_loss import DBLoss +from .det_east_loss import EASTLoss +from .det_sast_loss import SASTLoss +from .det_pse_loss import PSELoss +from .det_fce_loss import FCELoss +from .det_ct_loss import CTLoss +from .det_drrg_loss import DRRGLoss + +# rec loss +from .rec_ctc_loss import CTCLoss +from .rec_att_loss import AttentionLoss +from .rec_srn_loss import SRNLoss +from .rec_ce_loss import CELoss +from .rec_sar_loss import SARLoss +from .rec_aster_loss import AsterLoss +from .rec_pren_loss import PRENLoss +from .rec_multi_loss import MultiLoss +from .rec_vl_loss import VLLoss +from .rec_spin_att_loss import SPINAttentionLoss +from .rec_rfl_loss import RFLLoss +from .rec_can_loss import CANLoss + +# cls loss +from .cls_loss import ClsLoss + +# e2e loss +from .e2e_pg_loss import PGLoss +from .kie_sdmgr_loss import SDMGRLoss + +# basic loss function +from .basic_loss import DistanceLoss + +# combined loss function +from .combined_loss import CombinedLoss + +# table loss +from .table_att_loss import TableAttentionLoss, SLALoss +from .table_master_loss import TableMasterLoss +# vqa token loss +from .vqa_token_layoutlm_loss import VQASerTokenLayoutLMLoss + +# sr loss +from .stroke_focus_loss import StrokeFocusLoss +from .text_focus_loss import TelescopeLoss + + +def build_loss(config): + support_dict = [ + 'DBLoss', 'PSELoss', 'EASTLoss', 'SASTLoss', 'FCELoss', 'CTCLoss', + 'ClsLoss', 'AttentionLoss', 'SRNLoss', 'PGLoss', 'CombinedLoss', + 'CELoss', 'TableAttentionLoss', 'SARLoss', 'AsterLoss', 'SDMGRLoss', + 'VQASerTokenLayoutLMLoss', 'LossFromOutput', 'PRENLoss', 'MultiLoss', + 'TableMasterLoss', 'SPINAttentionLoss', 'VLLoss', 'StrokeFocusLoss', + 'SLALoss', 'CTLoss', 'RFLLoss', 'DRRGLoss', 'CANLoss', 'TelescopeLoss' + ] + config = copy.deepcopy(config) + module_name = config.pop('name') + assert module_name in support_dict, Exception('loss only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/losses/ace_loss.py b/ppocr/losses/ace_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..915b99e6ec1d6cb4641d8032fa188c61006dfbb3 --- /dev/null +++ b/ppocr/losses/ace_loss.py @@ -0,0 +1,52 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
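To make the build_loss factory above concrete: it deep-copies the config, pops 'name', checks it against support_dict, and passes the remaining keys to the constructor. A hedged sketch, with DBLoss hyper-parameters mirroring the defaults shown in det_db_loss.py further down; the values are illustrative, not taken from any shipped config.

from ppocr.losses import build_loss

db_loss = build_loss({
    'name': 'DBLoss',             # must be one of support_dict
    'balance_loss': True,
    'main_loss_type': 'DiceLoss',
    'alpha': 5,
    'beta': 10,
    'ohem_ratio': 3,
})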
+ +# This code is refer from: https://github.com/viig99/LS-ACELoss + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class ACELoss(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + self.loss_func = nn.CrossEntropyLoss( + weight=None, + ignore_index=0, + reduction='none', + soft_label=True, + axis=-1) + + def __call__(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] + + B, N = predicts.shape[:2] + div = paddle.to_tensor([N]).astype('float32') + + predicts = nn.functional.softmax(predicts, axis=-1) + aggregation_preds = paddle.sum(predicts, axis=1) + aggregation_preds = paddle.divide(aggregation_preds, div) + + length = batch[2].astype("float32") + batch = batch[3].astype("float32") + batch[:, 0] = paddle.subtract(div, length) + batch = paddle.divide(batch, div) + + loss = self.loss_func(aggregation_preds, batch) + return {"loss_ace": loss} diff --git a/ppocr/losses/basic_loss.py b/ppocr/losses/basic_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..58410b4db2157074c2cb0f7db590c84021e10ace --- /dev/null +++ b/ppocr/losses/basic_loss.py @@ -0,0 +1,167 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
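A small sketch of feeding the ACELoss above with dummy tensors, mainly to spell out the batch layout it assumes: batch[2] holds the label lengths and batch[3] the per-class character counts, with class 0 reused for the blank count that the loss recomputes as N - length. All shapes, ids, and counts below are invented for illustration.

import numpy as np
import paddle
from ppocr.losses.ace_loss import ACELoss

B, N, C = 2, 25, 100                         # batch, time steps, character classes (illustrative)
predicts = paddle.randn([B, N, C])

lengths = paddle.to_tensor([5, 7])           # batch[2]: labelled characters per sample
counts = np.zeros([B, C], dtype='float32')   # batch[3]: occurrences of each class in the label
counts[0, 3] = 5                             # sample 0: character id 3 occurs five times
counts[1, 7] = 4                             # sample 1: ids 7 and 9 occur four and three times
counts[1, 9] = 3

batch = [None, None, lengths, paddle.to_tensor(counts)]   # slots 0 and 1 are unused by ACELoss
loss = ACELoss()(predicts, batch)            # -> {'loss_ace': ...}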
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class CELoss(nn.Layer): + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + loss_dict = {} + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy(x, label=label, soft_label=soft_label) + return loss + + +class KLJSLoss(object): + def __init__(self, mode='kl'): + assert mode in ['kl', 'js', 'KL', 'JS' + ], "mode can only be one of ['kl', 'KL', 'js', 'JS']" + self.mode = mode + + def __call__(self, p1, p2, reduction="mean", eps=1e-5): + + if self.mode.lower() == 'kl': + loss = paddle.multiply(p2, + paddle.log((p2 + eps) / (p1 + eps) + eps)) + loss += paddle.multiply(p1, + paddle.log((p1 + eps) / (p2 + eps) + eps)) + loss *= 0.5 + elif self.mode.lower() == "js": + loss = paddle.multiply( + p2, paddle.log((2 * p2 + eps) / (p1 + p2 + eps) + eps)) + loss += paddle.multiply( + p1, paddle.log((2 * p1 + eps) / (p1 + p2 + eps) + eps)) + loss *= 0.5 + else: + raise ValueError( + "The mode.lower() if KLJSLoss should be one of ['kl', 'js']") + + if reduction == "mean": + loss = paddle.mean(loss, axis=[1, 2]) + elif reduction == "none" or reduction is None: + return loss + else: + loss = paddle.sum(loss, axis=[1, 2]) + + return loss + + +class DMLLoss(nn.Layer): + """ + DMLLoss + """ + + def __init__(self, act=None, use_log=False): + super().__init__() + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + + self.use_log = use_log + self.jskl_loss = KLJSLoss(mode="kl") + + def _kldiv(self, x, target): + eps = 1.0e-10 + loss = target * (paddle.log(target + eps) - x) + # batch mean loss + loss = paddle.sum(loss) / loss.shape[0] + return loss + + def forward(self, out1, out2): + if self.act is not None: + out1 = self.act(out1) + 1e-10 + out2 = self.act(out2) + 1e-10 + if self.use_log: + # for recognition distillation, log is needed for feature map + log_out1 = paddle.log(out1) + log_out2 = paddle.log(out2) + loss = ( + self._kldiv(log_out1, out2) + self._kldiv(log_out2, out1)) / 2.0 + else: + # for detection distillation log is not needed + loss = self.jskl_loss(out1, out2) + return loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + + def forward(self, x, y): + return self.loss_func(x, y) + + +class 
LossFromOutput(nn.Layer): + def __init__(self, key='loss', reduction='none'): + super().__init__() + self.key = key + self.reduction = reduction + + def forward(self, predicts, batch): + loss = predicts + if self.key is not None and isinstance(predicts, dict): + loss = loss[self.key] + if self.reduction == 'mean': + loss = paddle.mean(loss) + elif self.reduction == 'sum': + loss = paddle.sum(loss) + return {'loss': loss} diff --git a/ppocr/losses/center_loss.py b/ppocr/losses/center_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f62b8af373ed065a2fedbfa182f9692d6decefda --- /dev/null +++ b/ppocr/losses/center_loss.py @@ -0,0 +1,88 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +# This code is refer from: https://github.com/KaiyangZhou/pytorch-center-loss + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import pickle + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CenterLoss(nn.Layer): + """ + Reference: Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016. + """ + + def __init__(self, num_classes=6625, feat_dim=96, center_file_path=None): + super().__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]).astype("float64") + + if center_file_path is not None: + assert os.path.exists( + center_file_path + ), f"center path({center_file_path}) must exist when it is not None." 
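+            # The pickle file is expected to map character indices to pre-computed
+            # center vectors; loaded entries overwrite the random initialisation above.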
+ with open(center_file_path, 'rb') as f: + char_dict = pickle.load(f) + for key in char_dict.keys(): + self.centers[key] = paddle.to_tensor(char_dict[key]) + + def __call__(self, predicts, batch): + assert isinstance(predicts, (list, tuple)) + features, predicts = predicts + + feats_reshape = paddle.reshape( + features, [-1, features.shape[-1]]).astype("float64") + label = paddle.argmax(predicts, axis=2) + label = paddle.reshape(label, [label.shape[0] * label.shape[1]]) + + batch_size = feats_reshape.shape[0] + + #calc l2 distance between feats and centers + square_feat = paddle.sum(paddle.square(feats_reshape), + axis=1, + keepdim=True) + square_feat = paddle.expand(square_feat, [batch_size, self.num_classes]) + + square_center = paddle.sum(paddle.square(self.centers), + axis=1, + keepdim=True) + square_center = paddle.expand( + square_center, [self.num_classes, batch_size]).astype("float64") + square_center = paddle.transpose(square_center, [1, 0]) + + distmat = paddle.add(square_feat, square_center) + feat_dot_center = paddle.matmul(feats_reshape, + paddle.transpose(self.centers, [1, 0])) + distmat = distmat - 2.0 * feat_dot_center + + #generate the mask + classes = paddle.arange(self.num_classes).astype("int64") + label = paddle.expand( + paddle.unsqueeze(label, 1), (batch_size, self.num_classes)) + mask = paddle.equal( + paddle.expand(classes, [batch_size, self.num_classes]), + label).astype("float64") + dist = paddle.multiply(distmat, mask) + + loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size + return {'loss_center': loss} diff --git a/ppocr/losses/cls_loss.py b/ppocr/losses/cls_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..abc5e5b72cb055716715345105b59089f0a96edc --- /dev/null +++ b/ppocr/losses/cls_loss.py @@ -0,0 +1,30 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + + +class ClsLoss(nn.Layer): + def __init__(self, **kwargs): + super(ClsLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(reduction='mean') + + def forward(self, predicts, batch): + label = batch[1].astype("int64") + loss = self.loss_func(input=predicts, label=label) + return {'loss': loss} diff --git a/ppocr/losses/combined_loss.py b/ppocr/losses/combined_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..8d697d544b51899cdafeff94be2ecce067b907a2 --- /dev/null +++ b/ppocr/losses/combined_loss.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from .rec_ctc_loss import CTCLoss +from .center_loss import CenterLoss +from .ace_loss import ACELoss +from .rec_sar_loss import SARLoss + +from .distillation_loss import DistillationCTCLoss +from .distillation_loss import DistillationSARLoss +from .distillation_loss import DistillationDMLLoss +from .distillation_loss import DistillationDistanceLoss, DistillationDBLoss, DistillationDilaDBLoss +from .distillation_loss import DistillationVQASerTokenLayoutLMLoss, DistillationSERDMLLoss +from .distillation_loss import DistillationLossFromOutput +from .distillation_loss import DistillationVQADistanceLoss + + +class CombinedLoss(nn.Layer): + """ + CombinedLoss: + a combionation of loss function + """ + + def __init__(self, loss_config_list=None): + super().__init__() + self.loss_func = [] + self.loss_weight = [] + assert isinstance(loss_config_list, list), ( + 'operator config should be a list') + for config in loss_config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + name = list(config)[0] + param = config[name] + assert "weight" in param, "weight must be in param, but param just contains {}".format( + param.keys()) + self.loss_weight.append(param.pop("weight")) + self.loss_func.append(eval(name)(**param)) + + def forward(self, input, batch, **kargs): + loss_dict = {} + loss_all = 0. + for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch, **kargs) + if isinstance(loss, paddle.Tensor): + loss = {"loss_{}_{}".format(str(loss), idx): loss} + + weight = self.loss_weight[idx] + + loss = {key: loss[key] * weight for key in loss} + + if "loss" in loss: + loss_all += loss["loss"] + else: + loss_all += paddle.add_n(list(loss.values())) + loss_dict.update(loss) + loss_dict["loss"] = loss_all + return loss_dict diff --git a/ppocr/losses/det_basic_loss.py b/ppocr/losses/det_basic_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..61ea579b41d3cdf7831c168f563a1e3cd72463a0 --- /dev/null +++ b/ppocr/losses/det_basic_loss.py @@ -0,0 +1,153 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
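To show the loss_config_list contract enforced by CombinedLoss above: every entry is a single-key dict whose key names a loss class and whose value must contain 'weight'; the remaining keys become constructor arguments. The weights and CenterLoss settings in this sketch are illustrative.

from ppocr.losses.combined_loss import CombinedLoss

loss_fn = CombinedLoss(loss_config_list=[
    {'CTCLoss': {'weight': 1.0}},
    {'CenterLoss': {'weight': 0.05, 'num_classes': 6625, 'feat_dim': 96}},
])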
+""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/models/losses/basic_loss.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class BalanceLoss(nn.Layer): + def __init__(self, + balance_loss=True, + main_loss_type='DiceLoss', + negative_ratio=3, + return_origin=False, + eps=1e-6, + **kwargs): + """ + The BalanceLoss for Differentiable Binarization text detection + args: + balance_loss (bool): whether balance loss or not, default is True + main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss', + 'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'. + negative_ratio (int|float): float, default is 3. + return_origin (bool): whether return unbalanced loss or not, default is False. + eps (float): default is 1e-6. + """ + super(BalanceLoss, self).__init__() + self.balance_loss = balance_loss + self.main_loss_type = main_loss_type + self.negative_ratio = negative_ratio + self.return_origin = return_origin + self.eps = eps + + if self.main_loss_type == "CrossEntropy": + self.loss = nn.CrossEntropyLoss() + elif self.main_loss_type == "Euclidean": + self.loss = nn.MSELoss() + elif self.main_loss_type == "DiceLoss": + self.loss = DiceLoss(self.eps) + elif self.main_loss_type == "BCELoss": + self.loss = BCELoss(reduction='none') + elif self.main_loss_type == "MaskL1Loss": + self.loss = MaskL1Loss(self.eps) + else: + loss_type = [ + 'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss' + ] + raise Exception( + "main_loss_type in BalanceLoss() can only be one of {}".format( + loss_type)) + + def forward(self, pred, gt, mask=None): + """ + The BalanceLoss for Differentiable Binarization text detection + args: + pred (variable): predicted feature maps. + gt (variable): ground truth feature maps. + mask (variable): masked maps. + return: (variable) balanced loss + """ + positive = gt * mask + negative = (1 - gt) * mask + + positive_count = int(positive.sum()) + negative_count = int( + min(negative.sum(), positive_count * self.negative_ratio)) + loss = self.loss(pred, gt, mask=mask) + + if not self.balance_loss: + return loss + + positive_loss = positive * loss + negative_loss = negative * loss + negative_loss = paddle.reshape(negative_loss, shape=[-1]) + if negative_count > 0: + sort_loss = negative_loss.sort(descending=True) + negative_loss = sort_loss[:negative_count] + # negative_loss, _ = paddle.topk(negative_loss, k=negative_count_int) + balance_loss = (positive_loss.sum() + negative_loss.sum()) / ( + positive_count + negative_count + self.eps) + else: + balance_loss = positive_loss.sum() / (positive_count + self.eps) + if self.return_origin: + return balance_loss, loss + + return balance_loss + + +class DiceLoss(nn.Layer): + def __init__(self, eps=1e-6): + super(DiceLoss, self).__init__() + self.eps = eps + + def forward(self, pred, gt, mask, weights=None): + """ + DiceLoss function. 
+ """ + + assert pred.shape == gt.shape + assert pred.shape == mask.shape + if weights is not None: + assert weights.shape == mask.shape + mask = weights * mask + intersection = paddle.sum(pred * gt * mask) + + union = paddle.sum(pred * mask) + paddle.sum(gt * mask) + self.eps + loss = 1 - 2.0 * intersection / union + assert loss <= 1 + return loss + + +class MaskL1Loss(nn.Layer): + def __init__(self, eps=1e-6): + super(MaskL1Loss, self).__init__() + self.eps = eps + + def forward(self, pred, gt, mask): + """ + Mask L1 Loss + """ + loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps) + loss = paddle.mean(loss) + return loss + + +class BCELoss(nn.Layer): + def __init__(self, reduction='mean'): + super(BCELoss, self).__init__() + self.reduction = reduction + + def forward(self, input, label, mask=None, weight=None, name=None): + loss = F.binary_cross_entropy(input, label, reduction=self.reduction) + return loss diff --git a/ppocr/losses/det_ct_loss.py b/ppocr/losses/det_ct_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f48c95be4f84e2d8520363379b3061fa4245c105 --- /dev/null +++ b/ppocr/losses/det_ct_loss.py @@ -0,0 +1,276 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/shengtao96/CentripetalText/tree/main/models/loss +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +import numpy as np + + +def ohem_single(score, gt_text, training_mask): + # online hard example mining + + pos_num = int(paddle.sum(gt_text > 0.5)) - int( + paddle.sum((gt_text > 0.5) & (training_mask <= 0.5))) + + if pos_num == 0: + # selected_mask = gt_text.copy() * 0 # may be not good + selected_mask = training_mask + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + neg_num = int(paddle.sum((gt_text <= 0.5) & (training_mask > 0.5))) + neg_num = int(min(pos_num * 3, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + # hard example + neg_score = score[(gt_text <= 0.5) & (training_mask > 0.5)] + neg_score_sorted = paddle.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = ((score >= threshold) | + (gt_text > 0.5)) & (training_mask > 0.5) + selected_mask = paddle.cast( + selected_mask.reshape( + (1, selected_mask.shape[0], selected_mask.shape[1])), "float32") + return selected_mask + + +def ohem_batch(scores, gt_texts, training_masks): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append( + ohem_single(scores[i, :, :], gt_texts[i, :, :], training_masks[ + i, :, :])) + + selected_masks = paddle.cast(paddle.concat(selected_masks, 0), "float32") + return selected_masks + + +def iou_single(a, b, mask, n_class): + EPS = 1e-6 + valid = mask == 1 + a = a[valid] + b = b[valid] + miou = [] + + # iou of each class + for i in range(n_class): + inter = paddle.cast(((a == i) & (b == i)), "float32") + union = paddle.cast(((a == i) | (b == i)), "float32") + + miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.shape[0] + + a = a.reshape((batch_size, -1)) + b = b.reshape((batch_size, -1)) + mask = mask.reshape((batch_size, -1)) + + iou = paddle.zeros((batch_size, ), dtype="float32") + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = paddle.mean(iou) + return iou + + +class DiceLoss(nn.Layer): + def __init__(self, loss_weight=1.0): + super(DiceLoss, self).__init__() + self.loss_weight = loss_weight + + def forward(self, input, target, mask, reduce=True): + batch_size = input.shape[0] + input = F.sigmoid(input) # scale to 0-1 + + input = input.reshape((batch_size, -1)) + target = paddle.cast(target.reshape((batch_size, -1)), "float32") + mask = paddle.cast(mask.reshape((batch_size, -1)), "float32") + + input = input * mask + target = target * mask + + a = paddle.sum(input * target, axis=1) + b = paddle.sum(input * input, axis=1) + 0.001 + c = paddle.sum(target * target, axis=1) + 0.001 + d = (2 * a) / (b + c) + loss = 1 - d + + loss = self.loss_weight * loss + + if reduce: + loss = paddle.mean(loss) + + return loss + + +class SmoothL1Loss(nn.Layer): + def __init__(self, beta=1.0, loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + self.beta = beta + self.loss_weight = loss_weight + + np_coord = np.zeros(shape=[640, 640, 2], 
dtype=np.int64) + for i in range(640): + for j in range(640): + np_coord[i, j, 0] = j + np_coord[i, j, 1] = i + np_coord = np_coord.reshape((-1, 2)) + + self.coord = self.create_parameter( + shape=[640 * 640, 2], + dtype="int32", # NOTE: not support "int64" before paddle 2.3.1 + default_initializer=nn.initializer.Assign(value=np_coord)) + self.coord.stop_gradient = True + + def forward_single(self, input, target, mask, beta=1.0, eps=1e-6): + batch_size = input.shape[0] + + diff = paddle.abs(input - target) * mask.unsqueeze(1) + loss = paddle.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + loss = paddle.cast(loss.reshape((batch_size, -1)), "float32") + mask = paddle.cast(mask.reshape((batch_size, -1)), "float32") + loss = paddle.sum(loss, axis=-1) + loss = loss / (mask.sum(axis=-1) + eps) + + return loss + + def select_single(self, distance, gt_instance, gt_kernel_instance, + training_mask): + + with paddle.no_grad(): + # paddle 2.3.1, paddle.slice not support: + # distance[:, self.coord[:, 1], self.coord[:, 0]] + select_distance_list = [] + for i in range(2): + tmp1 = distance[i, :] + tmp2 = tmp1[self.coord[:, 1], self.coord[:, 0]] + select_distance_list.append(tmp2.unsqueeze(0)) + select_distance = paddle.concat(select_distance_list, axis=0) + + off_points = paddle.cast( + self.coord, "float32") + 10 * select_distance.transpose((1, 0)) + + off_points = paddle.cast(off_points, "int64") + off_points = paddle.clip(off_points, 0, distance.shape[-1] - 1) + + selected_mask = ( + gt_instance[self.coord[:, 1], self.coord[:, 0]] != + gt_kernel_instance[off_points[:, 1], off_points[:, 0]]) + selected_mask = paddle.cast( + selected_mask.reshape((1, -1, distance.shape[-1])), "int64") + selected_training_mask = selected_mask * training_mask + + return selected_training_mask + + def forward(self, + distances, + gt_instances, + gt_kernel_instances, + training_masks, + gt_distances, + reduce=True): + + selected_training_masks = [] + for i in range(distances.shape[0]): + selected_training_masks.append( + self.select_single(distances[i, :, :, :], gt_instances[i, :, :], + gt_kernel_instances[i, :, :], training_masks[ + i, :, :])) + selected_training_masks = paddle.cast( + paddle.concat(selected_training_masks, 0), "float32") + + loss = self.forward_single(distances, gt_distances, + selected_training_masks, self.beta) + loss = self.loss_weight * loss + + with paddle.no_grad(): + batch_size = distances.shape[0] + false_num = selected_training_masks.reshape((batch_size, -1)) + false_num = false_num.sum(axis=-1) + total_num = paddle.cast( + training_masks.reshape((batch_size, -1)), "float32") + total_num = total_num.sum(axis=-1) + iou_text = (total_num - false_num) / (total_num + 1e-6) + + if reduce: + loss = paddle.mean(loss) + + return loss, iou_text + + +class CTLoss(nn.Layer): + def __init__(self): + super(CTLoss, self).__init__() + self.kernel_loss = DiceLoss() + self.loc_loss = SmoothL1Loss(beta=0.1, loss_weight=0.05) + + def forward(self, preds, batch): + imgs = batch[0] + out = preds['maps'] + gt_kernels, training_masks, gt_instances, gt_kernel_instances, training_mask_distances, gt_distances = batch[ + 1:] + + kernels = out[:, 0, :, :] + distances = out[:, 1:, :, :] + + # kernel loss + selected_masks = ohem_batch(kernels, gt_kernels, training_masks) + + loss_kernel = self.kernel_loss( + kernels, gt_kernels, selected_masks, reduce=False) + + iou_kernel = iou(paddle.cast((kernels > 0), "int64"), + gt_kernels, + training_masks, + reduce=False) + losses = dict(loss_kernels=loss_kernel, 
) + + # loc loss + loss_loc, iou_text = self.loc_loss( + distances, + gt_instances, + gt_kernel_instances, + training_mask_distances, + gt_distances, + reduce=False) + losses.update(dict(loss_loc=loss_loc, )) + + loss_all = loss_kernel + loss_loc + losses = {'loss': loss_all} + + return losses diff --git a/ppocr/losses/det_db_loss.py b/ppocr/losses/det_db_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..708ffbdb47f349304e2bfd781a836e79348475f4 --- /dev/null +++ b/ppocr/losses/det_db_loss.py @@ -0,0 +1,76 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/models/losses/DB_loss.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + +from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss + + +class DBLoss(nn.Layer): + """ + Differentiable Binarization (DB) Loss Function + args: + param (dict): the super paramter for DB Loss + """ + + def __init__(self, + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + **kwargs): + super(DBLoss, self).__init__() + self.alpha = alpha + self.beta = beta + self.dice_loss = DiceLoss(eps=eps) + self.l1_loss = MaskL1Loss(eps=eps) + self.bce_loss = BalanceLoss( + balance_loss=balance_loss, + main_loss_type=main_loss_type, + negative_ratio=ohem_ratio) + + def forward(self, predicts, labels): + predict_maps = predicts['maps'] + label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = labels[ + 1:] + shrink_maps = predict_maps[:, 0, :, :] + threshold_maps = predict_maps[:, 1, :, :] + binary_maps = predict_maps[:, 2, :, :] + + loss_shrink_maps = self.bce_loss(shrink_maps, label_shrink_map, + label_shrink_mask) + loss_threshold_maps = self.l1_loss(threshold_maps, label_threshold_map, + label_threshold_mask) + loss_binary_maps = self.dice_loss(binary_maps, label_shrink_map, + label_shrink_mask) + loss_shrink_maps = self.alpha * loss_shrink_maps + loss_threshold_maps = self.beta * loss_threshold_maps + + loss_all = loss_shrink_maps + loss_threshold_maps \ + + loss_binary_maps + losses = {'loss': loss_all, \ + "loss_shrink_maps": loss_shrink_maps, \ + "loss_threshold_maps": loss_threshold_maps, \ + "loss_binary_maps": loss_binary_maps} + return losses diff --git a/ppocr/losses/det_drrg_loss.py b/ppocr/losses/det_drrg_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..89d4b521c7d1d0f29104abc3f315379827f98af7 --- /dev/null +++ b/ppocr/losses/det_drrg_loss.py @@ -0,0 +1,224 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/losses/drrg_loss.py +""" + +import paddle +import paddle.nn.functional as F +from paddle import nn + + +class DRRGLoss(nn.Layer): + def __init__(self, ohem_ratio=3.0): + super().__init__() + self.ohem_ratio = ohem_ratio + self.downsample_ratio = 1.0 + + def balance_bce_loss(self, pred, gt, mask): + """Balanced Binary-CrossEntropy Loss. + + Args: + pred (Tensor): Shape of :math:`(1, H, W)`. + gt (Tensor): Shape of :math:`(1, H, W)`. + mask (Tensor): Shape of :math:`(1, H, W)`. + + Returns: + Tensor: Balanced bce loss. + """ + assert pred.shape == gt.shape == mask.shape + assert paddle.all(pred >= 0) and paddle.all(pred <= 1) + assert paddle.all(gt >= 0) and paddle.all(gt <= 1) + positive = gt * mask + negative = (1 - gt) * mask + positive_count = int(positive.sum()) + + if positive_count > 0: + loss = F.binary_cross_entropy(pred, gt, reduction='none') + positive_loss = paddle.sum(loss * positive) + negative_loss = loss * negative + negative_count = min( + int(negative.sum()), int(positive_count * self.ohem_ratio)) + else: + positive_loss = paddle.to_tensor(0.0) + loss = F.binary_cross_entropy(pred, gt, reduction='none') + negative_loss = loss * negative + negative_count = 100 + negative_loss, _ = paddle.topk( + negative_loss.reshape([-1]), negative_count) + + balance_loss = (positive_loss + paddle.sum(negative_loss)) / ( + float(positive_count + negative_count) + 1e-5) + + return balance_loss + + def gcn_loss(self, gcn_data): + """CrossEntropy Loss from gcn module. + + Args: + gcn_data (tuple(Tensor, Tensor)): The first is the + prediction with shape :math:`(N, 2)` and the + second is the gt label with shape :math:`(m, n)` + where :math:`m * n = N`. + + Returns: + Tensor: CrossEntropy loss. + """ + gcn_pred, gt_labels = gcn_data + gt_labels = gt_labels.reshape([-1]) + loss = F.cross_entropy(gcn_pred, gt_labels) + + return loss + + def bitmasks2tensor(self, bitmasks, target_sz): + """Convert Bitmasks to tensor. + + Args: + bitmasks (list[BitmapMasks]): The BitmapMasks list. Each item is + for one img. + target_sz (tuple(int, int)): The target tensor of size + :math:`(H, W)`. + + Returns: + list[Tensor]: The list of kernel tensors. Each element stands for + one kernel level. + """ + batch_size = len(bitmasks) + results = [] + + kernel = [] + for batch_inx in range(batch_size): + mask = bitmasks[batch_inx] + # hxw + mask_sz = mask.shape + # left, right, top, bottom + pad = [0, target_sz[1] - mask_sz[1], 0, target_sz[0] - mask_sz[0]] + mask = F.pad(mask, pad, mode='constant', value=0) + kernel.append(mask) + kernel = paddle.stack(kernel) + results.append(kernel) + + return results + + def forward(self, preds, labels): + """Compute Drrg loss. 
+ """ + + assert isinstance(preds, tuple) + gt_text_mask, gt_center_region_mask, gt_mask, gt_top_height_map, gt_bot_height_map, gt_sin_map, gt_cos_map = labels[ + 1:8] + + downsample_ratio = self.downsample_ratio + + pred_maps, gcn_data = preds + pred_text_region = pred_maps[:, 0, :, :] + pred_center_region = pred_maps[:, 1, :, :] + pred_sin_map = pred_maps[:, 2, :, :] + pred_cos_map = pred_maps[:, 3, :, :] + pred_top_height_map = pred_maps[:, 4, :, :] + pred_bot_height_map = pred_maps[:, 5, :, :] + feature_sz = pred_maps.shape + + # bitmask 2 tensor + mapping = { + 'gt_text_mask': paddle.cast(gt_text_mask, 'float32'), + 'gt_center_region_mask': + paddle.cast(gt_center_region_mask, 'float32'), + 'gt_mask': paddle.cast(gt_mask, 'float32'), + 'gt_top_height_map': paddle.cast(gt_top_height_map, 'float32'), + 'gt_bot_height_map': paddle.cast(gt_bot_height_map, 'float32'), + 'gt_sin_map': paddle.cast(gt_sin_map, 'float32'), + 'gt_cos_map': paddle.cast(gt_cos_map, 'float32') + } + gt = {} + for key, value in mapping.items(): + gt[key] = value + if abs(downsample_ratio - 1.0) < 1e-2: + gt[key] = self.bitmasks2tensor(gt[key], feature_sz[2:]) + else: + gt[key] = [item.rescale(downsample_ratio) for item in gt[key]] + gt[key] = self.bitmasks2tensor(gt[key], feature_sz[2:]) + if key in ['gt_top_height_map', 'gt_bot_height_map']: + gt[key] = [item * downsample_ratio for item in gt[key]] + gt[key] = [item for item in gt[key]] + + scale = paddle.sqrt(1.0 / (pred_sin_map**2 + pred_cos_map**2 + 1e-8)) + pred_sin_map = pred_sin_map * scale + pred_cos_map = pred_cos_map * scale + + loss_text = self.balance_bce_loss( + F.sigmoid(pred_text_region), gt['gt_text_mask'][0], + gt['gt_mask'][0]) + + text_mask = (gt['gt_text_mask'][0] * gt['gt_mask'][0]) + negative_text_mask = ((1 - gt['gt_text_mask'][0]) * gt['gt_mask'][0]) + loss_center_map = F.binary_cross_entropy( + F.sigmoid(pred_center_region), + gt['gt_center_region_mask'][0], + reduction='none') + if int(text_mask.sum()) > 0: + loss_center_positive = paddle.sum(loss_center_map * + text_mask) / paddle.sum(text_mask) + else: + loss_center_positive = paddle.to_tensor(0.0) + loss_center_negative = paddle.sum( + loss_center_map * + negative_text_mask) / paddle.sum(negative_text_mask) + loss_center = loss_center_positive + 0.5 * loss_center_negative + + center_mask = (gt['gt_center_region_mask'][0] * gt['gt_mask'][0]) + if int(center_mask.sum()) > 0: + map_sz = pred_top_height_map.shape + ones = paddle.ones(map_sz, dtype='float32') + loss_top = F.smooth_l1_loss( + pred_top_height_map / (gt['gt_top_height_map'][0] + 1e-2), + ones, + reduction='none') + loss_bot = F.smooth_l1_loss( + pred_bot_height_map / (gt['gt_bot_height_map'][0] + 1e-2), + ones, + reduction='none') + gt_height = ( + gt['gt_top_height_map'][0] + gt['gt_bot_height_map'][0]) + loss_height = paddle.sum( + (paddle.log(gt_height + 1) * + (loss_top + loss_bot)) * center_mask) / paddle.sum(center_mask) + + loss_sin = paddle.sum( + F.smooth_l1_loss( + pred_sin_map, gt['gt_sin_map'][0], + reduction='none') * center_mask) / paddle.sum(center_mask) + loss_cos = paddle.sum( + F.smooth_l1_loss( + pred_cos_map, gt['gt_cos_map'][0], + reduction='none') * center_mask) / paddle.sum(center_mask) + else: + loss_height = paddle.to_tensor(0.0) + loss_sin = paddle.to_tensor(0.0) + loss_cos = paddle.to_tensor(0.0) + + loss_gcn = self.gcn_loss(gcn_data) + + loss = loss_text + loss_center + loss_height + loss_sin + loss_cos + loss_gcn + results = dict( + loss=loss, + loss_text=loss_text, + loss_center=loss_center, + 
loss_height=loss_height, + loss_sin=loss_sin, + loss_cos=loss_cos, + loss_gcn=loss_gcn) + + return results diff --git a/ppocr/losses/det_east_loss.py b/ppocr/losses/det_east_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..bcf5372b72d49ebeb34c0ce127e7221d563a6c35 --- /dev/null +++ b/ppocr/losses/det_east_loss.py @@ -0,0 +1,63 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .det_basic_loss import DiceLoss + + +class EASTLoss(nn.Layer): + """ + """ + + def __init__(self, + eps=1e-6, + **kwargs): + super(EASTLoss, self).__init__() + self.dice_loss = DiceLoss(eps=eps) + + def forward(self, predicts, labels): + l_score, l_geo, l_mask = labels[1:] + f_score = predicts['f_score'] + f_geo = predicts['f_geo'] + + dice_loss = self.dice_loss(f_score, l_score, l_mask) + + #smoooth_l1_loss + channels = 8 + l_geo_split = paddle.split( + l_geo, num_or_sections=channels + 1, axis=1) + f_geo_split = paddle.split(f_geo, num_or_sections=channels, axis=1) + smooth_l1 = 0 + for i in range(0, channels): + geo_diff = l_geo_split[i] - f_geo_split[i] + abs_geo_diff = paddle.abs(geo_diff) + smooth_l1_sign = paddle.less_than(abs_geo_diff, l_score) + smooth_l1_sign = paddle.cast(smooth_l1_sign, dtype='float32') + in_loss = abs_geo_diff * abs_geo_diff * smooth_l1_sign + \ + (abs_geo_diff - 0.5) * (1.0 - smooth_l1_sign) + out_loss = l_geo_split[-1] / channels * in_loss * l_score + smooth_l1 += out_loss + smooth_l1_loss = paddle.mean(smooth_l1 * l_score) + + dice_loss = dice_loss * 0.01 + total_loss = dice_loss + smooth_l1_loss + losses = {"loss":total_loss, \ + "dice_loss":dice_loss,\ + "smooth_l1_loss":smooth_l1_loss} + return losses diff --git a/ppocr/losses/det_fce_loss.py b/ppocr/losses/det_fce_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..d7dfb5aa6c6b2ac7eaf03bcfb18b1b2859cbc521 --- /dev/null +++ b/ppocr/losses/det_fce_loss.py @@ -0,0 +1,227 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
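+# The FCELoss defined below regresses Fourier contour coefficients; its
+# fourier2poly helper reconstructs each contour as num_sample polygon points
+# (n = num_sample) via
+#     x(i) = sum_k [ real_k * cos(2*pi*k*i/n) - imag_k * sin(2*pi*k*i/n) ]
+#     y(i) = sum_k [ real_k * sin(2*pi*k*i/n) + imag_k * cos(2*pi*k*i/n) ]
+# with k running over [-fourier_degree, fourier_degree], matching the
+# einsum-based implementation further down in this file.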
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/losses/fce_loss.py +""" + +import numpy as np +from paddle import nn +import paddle +import paddle.nn.functional as F +from functools import partial + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +class FCELoss(nn.Layer): + """The class for implementing FCENet loss + FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped + Text Detection + + [https://arxiv.org/abs/2104.10442] + + Args: + fourier_degree (int) : The maximum Fourier transform degree k. + num_sample (int) : The sampling points number of regression + loss. If it is too small, fcenet tends to be overfitting. + ohem_ratio (float): the negative/positive ratio in OHEM. + """ + + def __init__(self, fourier_degree, num_sample, ohem_ratio=3.): + super().__init__() + self.fourier_degree = fourier_degree + self.num_sample = num_sample + self.ohem_ratio = ohem_ratio + + def forward(self, preds, labels): + assert isinstance(preds, dict) + preds = preds['levels'] + + p3_maps, p4_maps, p5_maps = labels[1:] + assert p3_maps[0].shape[0] == 4 * self.fourier_degree + 5,\ + 'fourier degree not equal in FCEhead and FCEtarget' + + # to tensor + gts = [p3_maps, p4_maps, p5_maps] + for idx, maps in enumerate(gts): + gts[idx] = paddle.to_tensor(np.stack(maps)) + + losses = multi_apply(self.forward_single, preds, gts) + + loss_tr = paddle.to_tensor(0.).astype('float32') + loss_tcl = paddle.to_tensor(0.).astype('float32') + loss_reg_x = paddle.to_tensor(0.).astype('float32') + loss_reg_y = paddle.to_tensor(0.).astype('float32') + loss_all = paddle.to_tensor(0.).astype('float32') + + for idx, loss in enumerate(losses): + loss_all += sum(loss) + if idx == 0: + loss_tr += sum(loss) + elif idx == 1: + loss_tcl += sum(loss) + elif idx == 2: + loss_reg_x += sum(loss) + else: + loss_reg_y += sum(loss) + + results = dict( + loss=loss_all, + loss_text=loss_tr, + loss_center=loss_tcl, + loss_reg_x=loss_reg_x, + loss_reg_y=loss_reg_y, ) + return results + + def forward_single(self, pred, gt): + cls_pred = paddle.transpose(pred[0], (0, 2, 3, 1)) + reg_pred = paddle.transpose(pred[1], (0, 2, 3, 1)) + gt = paddle.transpose(gt, (0, 2, 3, 1)) + + k = 2 * self.fourier_degree + 1 + tr_pred = paddle.reshape(cls_pred[:, :, :, :2], (-1, 2)) + tcl_pred = paddle.reshape(cls_pred[:, :, :, 2:], (-1, 2)) + x_pred = paddle.reshape(reg_pred[:, :, :, 0:k], (-1, k)) + y_pred = paddle.reshape(reg_pred[:, :, :, k:2 * k], (-1, k)) + + tr_mask = gt[:, :, :, :1].reshape([-1]) + tcl_mask = gt[:, :, :, 1:2].reshape([-1]) + train_mask = gt[:, :, :, 2:3].reshape([-1]) + x_map = paddle.reshape(gt[:, :, :, 3:3 + k], (-1, k)) + y_map = paddle.reshape(gt[:, :, :, 3 + k:], (-1, k)) + + tr_train_mask = (train_mask * tr_mask).astype('bool') + tr_train_mask2 = paddle.concat( + [tr_train_mask.unsqueeze(1), tr_train_mask.unsqueeze(1)], axis=1) + # tr loss + loss_tr = self.ohem(tr_pred, tr_mask, train_mask) + # tcl loss + loss_tcl = paddle.to_tensor(0.).astype('float32') + tr_neg_mask = tr_train_mask.logical_not() + tr_neg_mask2 = paddle.concat( + [tr_neg_mask.unsqueeze(1), tr_neg_mask.unsqueeze(1)], axis=1) + if tr_train_mask.sum().item() > 0: + loss_tcl_pos = F.cross_entropy( + tcl_pred.masked_select(tr_train_mask2).reshape([-1, 2]), + tcl_mask.masked_select(tr_train_mask).astype('int64')) + loss_tcl_neg = F.cross_entropy( + 
tcl_pred.masked_select(tr_neg_mask2).reshape([-1, 2]), + tcl_mask.masked_select(tr_neg_mask).astype('int64')) + loss_tcl = loss_tcl_pos + 0.5 * loss_tcl_neg + + # regression loss + loss_reg_x = paddle.to_tensor(0.).astype('float32') + loss_reg_y = paddle.to_tensor(0.).astype('float32') + if tr_train_mask.sum().item() > 0: + weight = (tr_mask.masked_select(tr_train_mask.astype('bool')) + .astype('float32') + tcl_mask.masked_select( + tr_train_mask.astype('bool')).astype('float32')) / 2 + weight = weight.reshape([-1, 1]) + + ft_x, ft_y = self.fourier2poly(x_map, y_map) + ft_x_pre, ft_y_pre = self.fourier2poly(x_pred, y_pred) + + dim = ft_x.shape[1] + + tr_train_mask3 = paddle.concat( + [tr_train_mask.unsqueeze(1) for i in range(dim)], axis=1) + + loss_reg_x = paddle.mean(weight * F.smooth_l1_loss( + ft_x_pre.masked_select(tr_train_mask3).reshape([-1, dim]), + ft_x.masked_select(tr_train_mask3).reshape([-1, dim]), + reduction='none')) + loss_reg_y = paddle.mean(weight * F.smooth_l1_loss( + ft_y_pre.masked_select(tr_train_mask3).reshape([-1, dim]), + ft_y.masked_select(tr_train_mask3).reshape([-1, dim]), + reduction='none')) + + return loss_tr, loss_tcl, loss_reg_x, loss_reg_y + + def ohem(self, predict, target, train_mask): + + pos = (target * train_mask).astype('bool') + neg = ((1 - target) * train_mask).astype('bool') + + pos2 = paddle.concat([pos.unsqueeze(1), pos.unsqueeze(1)], axis=1) + neg2 = paddle.concat([neg.unsqueeze(1), neg.unsqueeze(1)], axis=1) + + n_pos = pos.astype('float32').sum() + + if n_pos.item() > 0: + loss_pos = F.cross_entropy( + predict.masked_select(pos2).reshape([-1, 2]), + target.masked_select(pos).astype('int64'), + reduction='sum') + loss_neg = F.cross_entropy( + predict.masked_select(neg2).reshape([-1, 2]), + target.masked_select(neg).astype('int64'), + reduction='none') + n_neg = min( + int(neg.astype('float32').sum().item()), + int(self.ohem_ratio * n_pos.astype('float32'))) + else: + loss_pos = paddle.to_tensor(0.) + loss_neg = F.cross_entropy( + predict.masked_select(neg2).reshape([-1, 2]), + target.masked_select(neg).astype('int64'), + reduction='none') + n_neg = 100 + if len(loss_neg) > n_neg: + loss_neg, _ = paddle.topk(loss_neg, n_neg) + + return (loss_pos + loss_neg.sum()) / (n_pos + n_neg).astype('float32') + + def fourier2poly(self, real_maps, imag_maps): + """Transform Fourier coefficient maps to polygon maps. 
+ + Args: + real_maps (tensor): A map composed of the real parts of the + Fourier coefficients, whose shape is (-1, 2k+1) + imag_maps (tensor):A map composed of the imag parts of the + Fourier coefficients, whose shape is (-1, 2k+1) + + Returns + x_maps (tensor): A map composed of the x value of the polygon + represented by n sample points (xn, yn), whose shape is (-1, n) + y_maps (tensor): A map composed of the y value of the polygon + represented by n sample points (xn, yn), whose shape is (-1, n) + """ + + k_vect = paddle.arange( + -self.fourier_degree, self.fourier_degree + 1, + dtype='float32').reshape([-1, 1]) + i_vect = paddle.arange( + 0, self.num_sample, dtype='float32').reshape([1, -1]) + + transform_matrix = 2 * np.pi / self.num_sample * paddle.matmul(k_vect, + i_vect) + + x1 = paddle.einsum('ak, kn-> an', real_maps, + paddle.cos(transform_matrix)) + x2 = paddle.einsum('ak, kn-> an', imag_maps, + paddle.sin(transform_matrix)) + y1 = paddle.einsum('ak, kn-> an', real_maps, + paddle.sin(transform_matrix)) + y2 = paddle.einsum('ak, kn-> an', imag_maps, + paddle.cos(transform_matrix)) + + x_maps = x1 - x2 + y_maps = y1 + y2 + + return x_maps, y_maps diff --git a/ppocr/losses/det_pse_loss.py b/ppocr/losses/det_pse_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6b31343ed4d1687ee8ca44592fba0331b0b287dc --- /dev/null +++ b/ppocr/losses/det_pse_loss.py @@ -0,0 +1,149 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +import paddle +from paddle import nn +from paddle.nn import functional as F +import numpy as np +from ppocr.utils.iou import iou + + +class PSELoss(nn.Layer): + def __init__(self, + alpha, + ohem_ratio=3, + kernel_sample_mask='pred', + reduction='sum', + eps=1e-6, + **kwargs): + """Implement PSE Loss. 
+ """ + super(PSELoss, self).__init__() + assert reduction in ['sum', 'mean', 'none'] + self.alpha = alpha + self.ohem_ratio = ohem_ratio + self.kernel_sample_mask = kernel_sample_mask + self.reduction = reduction + self.eps = eps + + def forward(self, outputs, labels): + predicts = outputs['maps'] + predicts = F.interpolate(predicts, scale_factor=4) + + texts = predicts[:, 0, :, :] + kernels = predicts[:, 1:, :, :] + gt_texts, gt_kernels, training_masks = labels[1:] + + # text loss + selected_masks = self.ohem_batch(texts, gt_texts, training_masks) + + loss_text = self.dice_loss(texts, gt_texts, selected_masks) + iou_text = iou((texts > 0).astype('int64'), + gt_texts, + training_masks, + reduce=False) + losses = dict(loss_text=loss_text, iou_text=iou_text) + + # kernel loss + loss_kernels = [] + if self.kernel_sample_mask == 'gt': + selected_masks = gt_texts * training_masks + elif self.kernel_sample_mask == 'pred': + selected_masks = ( + F.sigmoid(texts) > 0.5).astype('float32') * training_masks + + for i in range(kernels.shape[1]): + kernel_i = kernels[:, i, :, :] + gt_kernel_i = gt_kernels[:, i, :, :] + loss_kernel_i = self.dice_loss(kernel_i, gt_kernel_i, + selected_masks) + loss_kernels.append(loss_kernel_i) + loss_kernels = paddle.mean(paddle.stack(loss_kernels, axis=1), axis=1) + iou_kernel = iou((kernels[:, -1, :, :] > 0).astype('int64'), + gt_kernels[:, -1, :, :], + training_masks * gt_texts, + reduce=False) + losses.update(dict(loss_kernels=loss_kernels, iou_kernel=iou_kernel)) + loss = self.alpha * loss_text + (1 - self.alpha) * loss_kernels + losses['loss'] = loss + if self.reduction == 'sum': + losses = {x: paddle.sum(v) for x, v in losses.items()} + elif self.reduction == 'mean': + losses = {x: paddle.mean(v) for x, v in losses.items()} + return losses + + def dice_loss(self, input, target, mask): + input = F.sigmoid(input) + + input = input.reshape([input.shape[0], -1]) + target = target.reshape([target.shape[0], -1]) + mask = mask.reshape([mask.shape[0], -1]) + + input = input * mask + target = target * mask + + a = paddle.sum(input * target, 1) + b = paddle.sum(input * input, 1) + self.eps + c = paddle.sum(target * target, 1) + self.eps + d = (2 * a) / (b + c) + return 1 - d + + def ohem_single(self, score, gt_text, training_mask, ohem_ratio=3): + pos_num = int(paddle.sum((gt_text > 0.5).astype('float32'))) - int( + paddle.sum( + paddle.logical_and((gt_text > 0.5), (training_mask <= 0.5)) + .astype('float32'))) + + if pos_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + neg_num = int(paddle.sum((gt_text <= 0.5).astype('float32'))) + neg_num = int(min(pos_num * ohem_ratio, neg_num)) + + if neg_num == 0: + selected_mask = training_mask + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + neg_score = paddle.masked_select(score, gt_text <= 0.5) + neg_score_sorted = paddle.sort(-neg_score) + threshold = -neg_score_sorted[neg_num - 1] + + selected_mask = paddle.logical_and( + paddle.logical_or((score >= threshold), (gt_text > 0.5)), + (training_mask > 0.5)) + selected_mask = selected_mask.reshape( + [1, selected_mask.shape[0], selected_mask.shape[1]]).astype( + 'float32') + return selected_mask + + def ohem_batch(self, scores, gt_texts, training_masks, ohem_ratio=3): + selected_masks = [] + for i in range(scores.shape[0]): + selected_masks.append( + 
self.ohem_single(scores[i, :, :], gt_texts[i, :, :], + training_masks[i, :, :], ohem_ratio)) + + selected_masks = paddle.concat(selected_masks, 0).astype('float32') + return selected_masks diff --git a/ppocr/losses/det_sast_loss.py b/ppocr/losses/det_sast_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0c756bd4ebe4157ed397a6c2e9b7e94054b4e7 --- /dev/null +++ b/ppocr/losses/det_sast_loss.py @@ -0,0 +1,121 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .det_basic_loss import DiceLoss +import numpy as np + + +class SASTLoss(nn.Layer): + """ + """ + + def __init__(self, eps=1e-6, **kwargs): + super(SASTLoss, self).__init__() + self.dice_loss = DiceLoss(eps=eps) + + def forward(self, predicts, labels): + """ + tcl_pos: N x 128 x 3 + tcl_mask: N x 128 x 1 + tcl_label: N x X list or LoDTensor + """ + + f_score = predicts['f_score'] + f_border = predicts['f_border'] + f_tvo = predicts['f_tvo'] + f_tco = predicts['f_tco'] + + l_score, l_border, l_mask, l_tvo, l_tco = labels[1:] + + #score_loss + intersection = paddle.sum(f_score * l_score * l_mask) + union = paddle.sum(f_score * l_mask) + paddle.sum(l_score * l_mask) + score_loss = 1.0 - 2 * intersection / (union + 1e-5) + + #border loss + l_border_split, l_border_norm = paddle.split( + l_border, num_or_sections=[4, 1], axis=1) + f_border_split = f_border + border_ex_shape = l_border_norm.shape * np.array([1, 4, 1, 1]) + l_border_norm_split = paddle.expand( + x=l_border_norm, shape=border_ex_shape) + l_border_score = paddle.expand(x=l_score, shape=border_ex_shape) + l_border_mask = paddle.expand(x=l_mask, shape=border_ex_shape) + + border_diff = l_border_split - f_border_split + abs_border_diff = paddle.abs(border_diff) + border_sign = abs_border_diff < 1.0 + border_sign = paddle.cast(border_sign, dtype='float32') + border_sign.stop_gradient = True + border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \ + (abs_border_diff - 0.5) * (1.0 - border_sign) + border_out_loss = l_border_norm_split * border_in_loss + border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \ + (paddle.sum(l_border_score * l_border_mask) + 1e-5) + + #tvo_loss + l_tvo_split, l_tvo_norm = paddle.split( + l_tvo, num_or_sections=[8, 1], axis=1) + f_tvo_split = f_tvo + tvo_ex_shape = l_tvo_norm.shape * np.array([1, 8, 1, 1]) + l_tvo_norm_split = paddle.expand(x=l_tvo_norm, shape=tvo_ex_shape) + l_tvo_score = paddle.expand(x=l_score, shape=tvo_ex_shape) + l_tvo_mask = paddle.expand(x=l_mask, shape=tvo_ex_shape) + # + tvo_geo_diff = l_tvo_split - f_tvo_split + abs_tvo_geo_diff = paddle.abs(tvo_geo_diff) + tvo_sign = abs_tvo_geo_diff < 1.0 + tvo_sign = paddle.cast(tvo_sign, dtype='float32') + tvo_sign.stop_gradient = True + tvo_in_loss = 0.5 * abs_tvo_geo_diff * abs_tvo_geo_diff * tvo_sign + 
\ + (abs_tvo_geo_diff - 0.5) * (1.0 - tvo_sign) + tvo_out_loss = l_tvo_norm_split * tvo_in_loss + tvo_loss = paddle.sum(tvo_out_loss * l_tvo_score * l_tvo_mask) / \ + (paddle.sum(l_tvo_score * l_tvo_mask) + 1e-5) + + #tco_loss + l_tco_split, l_tco_norm = paddle.split( + l_tco, num_or_sections=[2, 1], axis=1) + f_tco_split = f_tco + tco_ex_shape = l_tco_norm.shape * np.array([1, 2, 1, 1]) + l_tco_norm_split = paddle.expand(x=l_tco_norm, shape=tco_ex_shape) + l_tco_score = paddle.expand(x=l_score, shape=tco_ex_shape) + l_tco_mask = paddle.expand(x=l_mask, shape=tco_ex_shape) + + tco_geo_diff = l_tco_split - f_tco_split + abs_tco_geo_diff = paddle.abs(tco_geo_diff) + tco_sign = abs_tco_geo_diff < 1.0 + tco_sign = paddle.cast(tco_sign, dtype='float32') + tco_sign.stop_gradient = True + tco_in_loss = 0.5 * abs_tco_geo_diff * abs_tco_geo_diff * tco_sign + \ + (abs_tco_geo_diff - 0.5) * (1.0 - tco_sign) + tco_out_loss = l_tco_norm_split * tco_in_loss + tco_loss = paddle.sum(tco_out_loss * l_tco_score * l_tco_mask) / \ + (paddle.sum(l_tco_score * l_tco_mask) + 1e-5) + + # total loss + tvo_lw, tco_lw = 1.5, 1.5 + score_lw, border_lw = 1.0, 1.0 + total_loss = score_loss * score_lw + border_loss * border_lw + \ + tvo_loss * tvo_lw + tco_loss * tco_lw + + losses = {'loss':total_loss, "score_loss":score_loss,\ + "border_loss":border_loss, 'tvo_loss':tvo_loss, 'tco_loss':tco_loss} + return losses diff --git a/ppocr/losses/distillation_loss.py b/ppocr/losses/distillation_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4bfbed75a338e2bd3bca0b80d16028030bf2f0b5 --- /dev/null +++ b/ppocr/losses/distillation_loss.py @@ -0,0 +1,461 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import numpy as np +import cv2 + +from .rec_ctc_loss import CTCLoss +from .rec_sar_loss import SARLoss +from .basic_loss import DMLLoss +from .basic_loss import DistanceLoss +from .basic_loss import LossFromOutput +from .det_db_loss import DBLoss +from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss +from .vqa_token_layoutlm_loss import VQASerTokenLayoutLMLoss + + +def _sum_loss(loss_dict): + if "loss" in loss_dict.keys(): + return loss_dict + else: + loss_dict["loss"] = 0. 
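+        # no 'loss' entry yet: start from 0. and let the loop below add
+        # every named sub-loss into the aggregated value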
+ for k, value in loss_dict.items(): + if k == "loss": + continue + else: + loss_dict["loss"] += value + return loss_dict + + +class DistillationDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + act=None, + use_log=False, + key=None, + multi_head=False, + dis_head='ctc', + maps_name=None, + name="dml"): + super().__init__(act=act, use_log=use_log) + assert isinstance(model_name_pairs, list) + self.key = key + self.multi_head = multi_head + self.dis_head = dis_head + self.model_name_pairs = self._check_model_name_pairs(model_name_pairs) + self.name = name + self.maps_name = self._check_maps_name(maps_name) + + def _check_model_name_pairs(self, model_name_pairs): + if not isinstance(model_name_pairs, list): + return [] + elif isinstance(model_name_pairs[0], list) and isinstance( + model_name_pairs[0][0], str): + return model_name_pairs + else: + return [model_name_pairs] + + def _check_maps_name(self, maps_name): + if maps_name is None: + return None + elif type(maps_name) == str: + return [maps_name] + elif type(maps_name) == list: + return [maps_name] + else: + return None + + def _slice_out(self, outs): + new_outs = {} + for k in self.maps_name: + if k == "thrink_maps": + new_outs[k] = outs[:, 0, :, :] + elif k == "threshold_maps": + new_outs[k] = outs[:, 1, :, :] + elif k == "binary_maps": + new_outs[k] = outs[:, 2, :, :] + else: + continue + return new_outs + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + + if self.maps_name is None: + if self.multi_head: + loss = super().forward(out1[self.dis_head], + out2[self.dis_head]) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + else: + outs1 = self._slice_out(out1) + outs2 = self._slice_out(out2) + for _c, k in enumerate(outs1.keys()): + loss = super().forward(outs1[k], outs2[k]) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}_{}".format(key, pair[ + 0], pair[1], self.maps_name, idx)] = loss[key] + else: + loss_dict["{}_{}_{}".format(self.name, self.maps_name[ + _c], idx)] = loss + + loss_dict = _sum_loss(loss_dict) + + return loss_dict + + +class DistillationCTCLoss(CTCLoss): + def __init__(self, + model_name_list=[], + key=None, + multi_head=False, + name="loss_ctc"): + super().__init__() + self.model_name_list = model_name_list + self.key = key + self.name = name + self.multi_head = multi_head + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + if self.multi_head: + assert 'ctc' in out, 'multi head has multi out' + loss = super().forward(out['ctc'], batch[:2] + batch[3:]) + else: + loss = super().forward(out, batch) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + return loss_dict + + +class DistillationSARLoss(SARLoss): + def __init__(self, + model_name_list=[], + key=None, + multi_head=False, + name="loss_sar", + **kwargs): + ignore_index = kwargs.get('ignore_index', 92) + super().__init__(ignore_index=ignore_index) + 
self.model_name_list = model_name_list + self.key = key + self.name = name + self.multi_head = multi_head + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + if self.multi_head: + assert 'sar' in out, 'multi head has multi out' + loss = super().forward(out['sar'], batch[:1] + batch[2:]) + else: + loss = super().forward(out, batch) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, model_name, + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + return loss_dict + + +class DistillationDBLoss(DBLoss): + def __init__(self, + model_name_list=[], + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="db", + **kwargs): + super().__init__() + self.model_name_list = model_name_list + self.name = name + self.key = None + + def forward(self, predicts, batch): + loss_dict = {} + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + + if isinstance(loss, dict): + for key in loss.keys(): + if key == "loss": + continue + name = "{}_{}_{}".format(self.name, model_name, key) + loss_dict[name] = loss[key] + else: + loss_dict["{}_{}".format(self.name, model_name)] = loss + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + +class DistillationDilaDBLoss(DBLoss): + def __init__(self, + model_name_pairs=[], + key=None, + balance_loss=True, + main_loss_type='DiceLoss', + alpha=5, + beta=10, + ohem_ratio=3, + eps=1e-6, + name="dila_dbloss"): + super().__init__() + self.model_name_pairs = model_name_pairs + self.name = name + self.key = key + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + stu_outs = predicts[pair[0]] + tch_outs = predicts[pair[1]] + if self.key is not None: + stu_preds = stu_outs[self.key] + tch_preds = tch_outs[self.key] + + stu_shrink_maps = stu_preds[:, 0, :, :] + stu_binary_maps = stu_preds[:, 2, :, :] + + # dilation to teacher prediction + dilation_w = np.array([[1, 1], [1, 1]]) + th_shrink_maps = tch_preds[:, 0, :, :] + th_shrink_maps = th_shrink_maps.numpy() > 0.3 # thresh = 0.3 + dilate_maps = np.zeros_like(th_shrink_maps).astype(np.float32) + for i in range(th_shrink_maps.shape[0]): + dilate_maps[i] = cv2.dilate( + th_shrink_maps[i, :, :].astype(np.uint8), dilation_w) + th_shrink_maps = paddle.to_tensor(dilate_maps) + + label_threshold_map, label_threshold_mask, label_shrink_map, label_shrink_mask = batch[ + 1:] + + # calculate the shrink map loss + bce_loss = self.alpha * self.bce_loss( + stu_shrink_maps, th_shrink_maps, label_shrink_mask) + loss_binary_maps = self.dice_loss(stu_binary_maps, th_shrink_maps, + label_shrink_mask) + + # k = f"{self.name}_{pair[0]}_{pair[1]}" + k = "{}_{}_{}".format(self.name, pair[0], pair[1]) + loss_dict[k] = bce_loss + loss_binary_maps + + loss_dict = _sum_loss(loss_dict) + return loss_dict + + +class DistillationDistanceLoss(DistanceLoss): + """ + """ + + def __init__(self, + mode="l2", + model_name_pairs=[], + key=None, + name="loss_distance", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + "_l2" + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, 
pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[ + key] + else: + loss_dict["{}_{}_{}_{}".format(self.name, pair[0], pair[1], + idx)] = loss + return loss_dict + + +class DistillationVQASerTokenLayoutLMLoss(VQASerTokenLayoutLMLoss): + def __init__(self, + num_classes, + model_name_list=[], + key=None, + name="loss_ser"): + super().__init__(num_classes=num_classes) + self.model_name_list = model_name_list + self.key = key + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + loss_dict["{}_{}".format(self.name, model_name)] = loss["loss"] + return loss_dict + + +class DistillationLossFromOutput(LossFromOutput): + def __init__(self, + reduction="none", + model_name_list=[], + dist_key=None, + key="loss", + name="loss_re"): + super().__init__(key=key, reduction=reduction) + self.model_name_list = model_name_list + self.name = name + self.dist_key = dist_key + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + out = predicts[model_name] + if self.dist_key is not None: + out = out[self.dist_key] + loss = super().forward(out, batch) + loss_dict["{}_{}".format(self.name, model_name)] = loss["loss"] + return loss_dict + + +class DistillationSERDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + act="softmax", + use_log=True, + num_classes=7, + model_name_pairs=[], + key=None, + name="loss_dml_ser"): + super().__init__(act=act, use_log=use_log) + assert isinstance(model_name_pairs, list) + self.key = key + self.name = name + self.num_classes = num_classes + self.model_name_pairs = model_name_pairs + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + out1 = out1.reshape([-1, out1.shape[-1]]) + out2 = out2.reshape([-1, out2.shape[-1]]) + + attention_mask = batch[2] + if attention_mask is not None: + active_output = attention_mask.reshape([-1, ]) == 1 + out1 = out1[active_output] + out2 = out2[active_output] + + loss_dict["{}_{}".format(self.name, idx)] = super().forward(out1, + out2) + + return loss_dict + + +class DistillationVQADistanceLoss(DistanceLoss): + def __init__(self, + mode="l2", + model_name_pairs=[], + key=None, + index=None, + name="loss_distance", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.index = index + self.model_name_pairs = model_name_pairs + self.name = name + "_l2" + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + attention_mask = batch[2] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.index is not None: + out1 = out1[:, self.index, :, :] + out2 = out2[:, self.index, :, :] + if attention_mask is not None: + max_len = attention_mask.shape[-1] + out1 = out1[:, :max_len] + out2 = out2[:, :max_len] + out1 = out1.reshape([-1, 
out1.shape[-1]]) + out2 = out2.reshape([-1, out2.shape[-1]]) + if attention_mask is not None: + active_output = attention_mask.reshape([-1, ]) == 1 + out1 = out1[active_output] + out2 = out2[active_output] + + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}nohu_{}".format(self.name, key, + idx)] = loss[key] + else: + loss_dict["{}_{}_{}_{}".format(self.name, pair[0], pair[1], + idx)] = loss + return loss_dict diff --git a/ppocr/losses/e2e_pg_loss.py b/ppocr/losses/e2e_pg_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..aff67b7ce3c208bf9c7b1371e095eac8c70ce9df --- /dev/null +++ b/ppocr/losses/e2e_pg_loss.py @@ -0,0 +1,141 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +import paddle + +from .det_basic_loss import DiceLoss +from ppocr.utils.e2e_utils.extract_batchsize import pre_process + + +class PGLoss(nn.Layer): + def __init__(self, + tcl_bs, + max_text_length, + max_text_nums, + pad_num, + eps=1e-6, + **kwargs): + super(PGLoss, self).__init__() + self.tcl_bs = tcl_bs + self.max_text_nums = max_text_nums + self.max_text_length = max_text_length + self.pad_num = pad_num + self.dice_loss = DiceLoss(eps=eps) + + def border_loss(self, f_border, l_border, l_score, l_mask): + l_border_split, l_border_norm = paddle.tensor.split( + l_border, num_or_sections=[4, 1], axis=1) + f_border_split = f_border + b, c, h, w = l_border_norm.shape + l_border_norm_split = paddle.expand( + x=l_border_norm, shape=[b, 4 * c, h, w]) + b, c, h, w = l_score.shape + l_border_score = paddle.expand(x=l_score, shape=[b, 4 * c, h, w]) + b, c, h, w = l_mask.shape + l_border_mask = paddle.expand(x=l_mask, shape=[b, 4 * c, h, w]) + border_diff = l_border_split - f_border_split + abs_border_diff = paddle.abs(border_diff) + border_sign = abs_border_diff < 1.0 + border_sign = paddle.cast(border_sign, dtype='float32') + border_sign.stop_gradient = True + border_in_loss = 0.5 * abs_border_diff * abs_border_diff * border_sign + \ + (abs_border_diff - 0.5) * (1.0 - border_sign) + border_out_loss = l_border_norm_split * border_in_loss + border_loss = paddle.sum(border_out_loss * l_border_score * l_border_mask) / \ + (paddle.sum(l_border_score * l_border_mask) + 1e-5) + return border_loss + + def direction_loss(self, f_direction, l_direction, l_score, l_mask): + l_direction_split, l_direction_norm = paddle.tensor.split( + l_direction, num_or_sections=[2, 1], axis=1) + f_direction_split = f_direction + b, c, h, w = l_direction_norm.shape + l_direction_norm_split = paddle.expand( + x=l_direction_norm, shape=[b, 2 * c, h, w]) + b, c, h, w = l_score.shape + l_direction_score = paddle.expand(x=l_score, shape=[b, 2 * c, h, w]) + b, c, h, w = l_mask.shape + l_direction_mask = paddle.expand(x=l_mask, shape=[b, 2 * c, h, w]) + direction_diff = l_direction_split 
- f_direction_split + abs_direction_diff = paddle.abs(direction_diff) + direction_sign = abs_direction_diff < 1.0 + direction_sign = paddle.cast(direction_sign, dtype='float32') + direction_sign.stop_gradient = True + direction_in_loss = 0.5 * abs_direction_diff * abs_direction_diff * direction_sign + \ + (abs_direction_diff - 0.5) * (1.0 - direction_sign) + direction_out_loss = l_direction_norm_split * direction_in_loss + direction_loss = paddle.sum(direction_out_loss * l_direction_score * l_direction_mask) / \ + (paddle.sum(l_direction_score * l_direction_mask) + 1e-5) + return direction_loss + + def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t): + f_char = paddle.transpose(f_char, [0, 2, 3, 1]) + tcl_pos = paddle.reshape(tcl_pos, [-1, 3]) + tcl_pos = paddle.cast(tcl_pos, dtype=int) + f_tcl_char = paddle.gather_nd(f_char, tcl_pos) + f_tcl_char = paddle.reshape( + f_tcl_char, [-1, 64, self.pad_num + 1]) # len(Lexicon_Table)+1 + f_tcl_char_fg, f_tcl_char_bg = paddle.split( + f_tcl_char, [self.pad_num, 1], axis=2) + f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0 + b, c, l = tcl_mask.shape + tcl_mask_fg = paddle.expand(x=tcl_mask, shape=[b, c, self.pad_num * l]) + tcl_mask_fg.stop_gradient = True + f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * ( + -20.0) + f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg], axis=2) + f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2)) + N, B, _ = f_tcl_char_ld.shape + input_lengths = paddle.to_tensor([N] * B, dtype='int64') + cost = paddle.nn.functional.ctc_loss( + log_probs=f_tcl_char_ld, + labels=tcl_label, + input_lengths=input_lengths, + label_lengths=label_t, + blank=self.pad_num, + reduction='none') + cost = cost.mean() + return cost + + def forward(self, predicts, labels): + images, tcl_maps, tcl_label_maps, border_maps \ + , direction_maps, training_masks, label_list, pos_list, pos_mask = labels + # for all the batch_size + pos_list, pos_mask, label_list, label_t = pre_process( + label_list, pos_list, pos_mask, self.max_text_length, + self.max_text_nums, self.pad_num, self.tcl_bs) + + f_score, f_border, f_direction, f_char = predicts['f_score'], predicts['f_border'], predicts['f_direction'], \ + predicts['f_char'] + score_loss = self.dice_loss(f_score, tcl_maps, training_masks) + border_loss = self.border_loss(f_border, border_maps, tcl_maps, + training_masks) + direction_loss = self.direction_loss(f_direction, direction_maps, + tcl_maps, training_masks) + ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list, label_t) + loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss + + losses = { + 'loss': loss_all, + "score_loss": score_loss, + "border_loss": border_loss, + "direction_loss": direction_loss, + "ctc_loss": ctc_loss + } + return losses diff --git a/ppocr/losses/kie_sdmgr_loss.py b/ppocr/losses/kie_sdmgr_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..745671f58da91c108624097faea72d55c1877f6b --- /dev/null +++ b/ppocr/losses/kie_sdmgr_loss.py @@ -0,0 +1,115 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/losses/sdmgr_loss.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +import paddle + + +class SDMGRLoss(nn.Layer): + def __init__(self, node_weight=1.0, edge_weight=1.0, ignore=0): + super().__init__() + self.loss_node = nn.CrossEntropyLoss(ignore_index=ignore) + self.loss_edge = nn.CrossEntropyLoss(ignore_index=-1) + self.node_weight = node_weight + self.edge_weight = edge_weight + self.ignore = ignore + + def pre_process(self, gts, tag): + gts, tag = gts.numpy(), tag.numpy().tolist() + temp_gts = [] + batch = len(tag) + for i in range(batch): + num, recoder_len = tag[i][0], tag[i][1] + temp_gts.append( + paddle.to_tensor( + gts[i, :num, :num + 1], dtype='int64')) + return temp_gts + + def accuracy(self, pred, target, topk=1, thresh=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class) + target (torch.Tensor): The target of each prediction, shape (N, ) + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.shape[0] == 0: + accu = [pred.new_tensor(0.) 
for i in range(len(topk))] + return accu[0] if return_single else accu + pred_value, pred_label = paddle.topk(pred, maxk, axis=1) + pred_label = pred_label.transpose( + [1, 0]) # transpose to shape (maxk, N) + correct = paddle.equal(pred_label, + (target.reshape([1, -1]).expand_as(pred_label))) + res = [] + for k in topk: + correct_k = paddle.sum(correct[:k].reshape([-1]).astype('float32'), + axis=0, + keepdim=True) + res.append( + paddle.multiply(correct_k, + paddle.to_tensor(100.0 / pred.shape[0]))) + return res[0] if return_single else res + + def forward(self, pred, batch): + node_preds, edge_preds = pred + gts, tag = batch[4], batch[5] + gts = self.pre_process(gts, tag) + node_gts, edge_gts = [], [] + for gt in gts: + node_gts.append(gt[:, 0]) + edge_gts.append(gt[:, 1:].reshape([-1])) + node_gts = paddle.concat(node_gts) + edge_gts = paddle.concat(edge_gts) + + node_valids = paddle.nonzero(node_gts != self.ignore).reshape([-1]) + edge_valids = paddle.nonzero(edge_gts != -1).reshape([-1]) + loss_node = self.loss_node(node_preds, node_gts) + loss_edge = self.loss_edge(edge_preds, edge_gts) + loss = self.node_weight * loss_node + self.edge_weight * loss_edge + return dict( + loss=loss, + loss_node=loss_node, + loss_edge=loss_edge, + acc_node=self.accuracy( + paddle.gather(node_preds, node_valids), + paddle.gather(node_gts, node_valids)), + acc_edge=self.accuracy( + paddle.gather(edge_preds, edge_valids), + paddle.gather(edge_gts, edge_valids))) diff --git a/ppocr/losses/rec_aster_loss.py b/ppocr/losses/rec_aster_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9b0a34eeac57089ae1d45ad9d8c0427b234c50c9 --- /dev/null +++ b/ppocr/losses/rec_aster_loss.py @@ -0,0 +1,99 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class CosineEmbeddingLoss(nn.Layer): + def __init__(self, margin=0.): + super(CosineEmbeddingLoss, self).__init__() + self.margin = margin + self.epsilon = 1e-12 + + def forward(self, x1, x2, target): + similarity = paddle.sum( + x1 * x2, axis=-1) / (paddle.norm( + x1, axis=-1) * paddle.norm( + x2, axis=-1) + self.epsilon) + one_list = paddle.full_like(target, fill_value=1) + out = paddle.mean( + paddle.where( + paddle.equal(target, one_list), 1. 
- similarity, + paddle.maximum( + paddle.zeros_like(similarity), similarity - self.margin))) + + return out + + +class AsterLoss(nn.Layer): + def __init__(self, + weight=None, + size_average=True, + ignore_index=-100, + sequence_normalize=False, + sample_normalize=True, + **kwargs): + super(AsterLoss, self).__init__() + self.weight = weight + self.size_average = size_average + self.ignore_index = ignore_index + self.sequence_normalize = sequence_normalize + self.sample_normalize = sample_normalize + self.loss_sem = CosineEmbeddingLoss() + self.is_cosin_loss = True + self.loss_func_rec = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + sem_target = batch[3].astype('float32') + embedding_vectors = predicts['embedding_vectors'] + rec_pred = predicts['rec_pred'] + + if not self.is_cosin_loss: + sem_loss = paddle.sum(self.loss_sem(embedding_vectors, sem_target)) + else: + label_target = paddle.ones([embedding_vectors.shape[0]]) + sem_loss = paddle.sum( + self.loss_sem(embedding_vectors, sem_target, label_target)) + + # rec loss + batch_size, def_max_length = targets.shape[0], targets.shape[1] + + mask = paddle.zeros([batch_size, def_max_length]) + for i in range(batch_size): + mask[i, :label_lengths[i]] = 1 + mask = paddle.cast(mask, "float32") + max_length = max(label_lengths) + assert max_length == rec_pred.shape[1] + targets = targets[:, :max_length] + mask = mask[:, :max_length] + rec_pred = paddle.reshape(rec_pred, [-1, rec_pred.shape[2]]) + input = nn.functional.log_softmax(rec_pred, axis=1) + targets = paddle.reshape(targets, [-1, 1]) + mask = paddle.reshape(mask, [-1, 1]) + output = -paddle.index_sample(input, index=targets) * mask + output = paddle.sum(output) + if self.sequence_normalize: + output = output / paddle.sum(mask) + if self.sample_normalize: + output = output / batch_size + + loss = output + sem_loss * 0.1 + return {'loss': loss} diff --git a/ppocr/losses/rec_att_loss.py b/ppocr/losses/rec_att_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6e2f67483c86a45f3aa1feb1e1fac1a5013bfb46 --- /dev/null +++ b/ppocr/losses/rec_att_loss.py @@ -0,0 +1,39 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
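+# AttentionLoss flattens the [N, num_steps, num_classes] predictions and the
+# corresponding [N, num_steps] targets from batch[1], then returns the sum of
+# the un-reduced per-step cross-entropy as {'loss': ...}.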
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class AttentionLoss(nn.Layer): + def __init__(self, **kwargs): + super(AttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = predicts.shape[0], predicts.shape[ + 1], predicts.shape[2] + assert len(targets.shape) == len(list(predicts.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predicts, [-1, predicts.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + + return {'loss': paddle.sum(self.loss_func(inputs, targets))} diff --git a/ppocr/losses/rec_can_loss.py b/ppocr/losses/rec_can_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ac5abd18752707f99d8cf7476025f6d2778963 --- /dev/null +++ b/ppocr/losses/rec_can_loss.py @@ -0,0 +1,79 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/LBH1024/CAN/models/can.py +""" + +import paddle +import paddle.nn as nn +import numpy as np + + +class CANLoss(nn.Layer): + ''' + CANLoss is consist of two part: + word_average_loss: average accuracy of the symbol + counting_loss: counting loss of every symbol + ''' + + def __init__(self): + super(CANLoss, self).__init__() + + self.use_label_mask = False + self.out_channel = 111 + self.cross = nn.CrossEntropyLoss( + reduction='none') if self.use_label_mask else nn.CrossEntropyLoss() + self.counting_loss = nn.SmoothL1Loss(reduction='mean') + self.ratio = 16 + + def forward(self, preds, batch): + word_probs = preds[0] + counting_preds = preds[1] + counting_preds1 = preds[2] + counting_preds2 = preds[3] + labels = batch[2] + labels_mask = batch[3] + counting_labels = gen_counting_label(labels, self.out_channel, True) + counting_loss = self.counting_loss(counting_preds1, counting_labels) + self.counting_loss(counting_preds2, counting_labels) \ + + self.counting_loss(counting_preds, counting_labels) + + word_loss = self.cross( + paddle.reshape(word_probs, [-1, word_probs.shape[-1]]), + paddle.reshape(labels, [-1])) + word_average_loss = paddle.sum( + paddle.reshape(word_loss * labels_mask, [-1])) / ( + paddle.sum(labels_mask) + 1e-10 + ) if self.use_label_mask else word_loss + loss = word_average_loss + counting_loss + return {'loss': loss} + + +def gen_counting_label(labels, channel, tag): + b, t = labels.shape + counting_labels = np.zeros([b, channel]) + + if tag: + ignore = [0, 1, 107, 108, 109, 110] + else: + ignore = [] + for i in range(b): + for j in range(t): + k = labels[i][j] + if k in ignore: + continue + else: + counting_labels[i][k] += 1 + counting_labels = paddle.to_tensor(counting_labels, dtype='float32') + return counting_labels diff --git 
a/ppocr/losses/rec_ce_loss.py b/ppocr/losses/rec_ce_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..614384de863c15b106aef831f8e938b89dadc246 --- /dev/null +++ b/ppocr/losses/rec_ce_loss.py @@ -0,0 +1,66 @@ +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class CELoss(nn.Layer): + def __init__(self, + smoothing=False, + with_all=False, + ignore_index=-1, + **kwargs): + super(CELoss, self).__init__() + if ignore_index >= 0: + self.loss_func = nn.CrossEntropyLoss( + reduction='mean', ignore_index=ignore_index) + else: + self.loss_func = nn.CrossEntropyLoss(reduction='mean') + self.smoothing = smoothing + self.with_all = with_all + + def forward(self, pred, batch): + + if isinstance(pred, dict): # for ABINet + loss = {} + loss_sum = [] + for name, logits in pred.items(): + if isinstance(logits, list): + logit_num = len(logits) + all_tgt = paddle.concat([batch[1]] * logit_num, 0) + all_logits = paddle.concat(logits, 0) + flt_logtis = all_logits.reshape([-1, all_logits.shape[2]]) + flt_tgt = all_tgt.reshape([-1]) + else: + flt_logtis = logits.reshape([-1, logits.shape[2]]) + flt_tgt = batch[1].reshape([-1]) + loss[name + '_loss'] = self.loss_func(flt_logtis, flt_tgt) + loss_sum.append(loss[name + '_loss']) + loss['loss'] = sum(loss_sum) + return loss + else: + if self.with_all: # for ViTSTR + tgt = batch[1] + pred = pred.reshape([-1, pred.shape[2]]) + tgt = tgt.reshape([-1]) + loss = self.loss_func(pred, tgt) + return {'loss': loss} + else: # for NRTR + max_len = batch[2].max() + tgt = batch[1][:, 1:2 + max_len] + pred = pred.reshape([-1, pred.shape[2]]) + tgt = tgt.reshape([-1]) + if self.smoothing: + eps = 0.1 + n_class = pred.shape[1] + one_hot = F.one_hot(tgt, pred.shape[1]) + one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / ( + n_class - 1) + log_prb = F.log_softmax(pred, axis=1) + non_pad_mask = paddle.not_equal( + tgt, paddle.zeros( + tgt.shape, dtype=tgt.dtype)) + loss = -(one_hot * log_prb).sum(axis=1) + loss = loss.masked_select(non_pad_mask).mean() + else: + loss = self.loss_func(pred, tgt) + return {'loss': loss} diff --git a/ppocr/losses/rec_ctc_loss.py b/ppocr/losses/rec_ctc_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..502fc8c5227460c1e299ae2a46d464d29ddbe374 --- /dev/null +++ b/ppocr/losses/rec_ctc_loss.py @@ -0,0 +1,45 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
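+# CTCLoss wraps paddle.nn.CTCLoss with blank index 0. `predicts` is expected
+# as [B, T, num_classes] (the last element is taken if a list/tuple is
+# passed); batch[1] holds the integer label ids and batch[2] their lengths.
+# With use_focal_loss=True the per-sample losses are re-weighted by
+# (1 - exp(-loss))**2 before averaging. A minimal call could look like this
+# (illustrative sketch only, assuming a dataloader that yields such a batch):
+#     loss_fn = CTCLoss(use_focal_loss=False)
+#     out = loss_fn(predicts, batch)   # -> {'loss': scalar Tensor}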
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class CTCLoss(nn.Layer): + def __init__(self, use_focal_loss=False, **kwargs): + super(CTCLoss, self).__init__() + self.loss_func = nn.CTCLoss(blank=0, reduction='none') + self.use_focal_loss = use_focal_loss + + def forward(self, predicts, batch): + if isinstance(predicts, (list, tuple)): + predicts = predicts[-1] + predicts = predicts.transpose((1, 0, 2)) + N, B, _ = predicts.shape + preds_lengths = paddle.to_tensor( + [N] * B, dtype='int64', place=paddle.CPUPlace()) + labels = batch[1].astype("int32") + label_lengths = batch[2].astype('int64') + loss = self.loss_func(predicts, labels, preds_lengths, label_lengths) + if self.use_focal_loss: + weight = paddle.exp(-loss) + weight = paddle.subtract(paddle.to_tensor([1.0]), weight) + weight = paddle.square(weight) + loss = paddle.multiply(loss, weight) + loss = loss.mean() + return {'loss': loss} diff --git a/ppocr/losses/rec_enhanced_ctc_loss.py b/ppocr/losses/rec_enhanced_ctc_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b57be6468e2ec75811442e7449525267e7d9e82e --- /dev/null +++ b/ppocr/losses/rec_enhanced_ctc_loss.py @@ -0,0 +1,70 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
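+# EnhancedCTCLoss combines the plain CTCLoss with an optional ACE loss term
+# (scaled by ace_loss_weight) and an optional center loss term (scaled by
+# center_loss_weight); the summed value is returned as 'enhanced_ctc_loss'.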
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from .ace_loss import ACELoss +from .center_loss import CenterLoss +from .rec_ctc_loss import CTCLoss + + +class EnhancedCTCLoss(nn.Layer): + def __init__(self, + use_focal_loss=False, + use_ace_loss=False, + ace_loss_weight=0.1, + use_center_loss=False, + center_loss_weight=0.05, + num_classes=6625, + feat_dim=96, + init_center=False, + center_file_path=None, + **kwargs): + super(EnhancedCTCLoss, self).__init__() + self.ctc_loss_func = CTCLoss(use_focal_loss=use_focal_loss) + + self.use_ace_loss = False + if use_ace_loss: + self.use_ace_loss = use_ace_loss + self.ace_loss_func = ACELoss() + self.ace_loss_weight = ace_loss_weight + + self.use_center_loss = False + if use_center_loss: + self.use_center_loss = use_center_loss + self.center_loss_func = CenterLoss( + num_classes=num_classes, + feat_dim=feat_dim, + init_center=init_center, + center_file_path=center_file_path) + self.center_loss_weight = center_loss_weight + + def __call__(self, predicts, batch): + loss = self.ctc_loss_func(predicts, batch)["loss"] + + if self.use_center_loss: + center_loss = self.center_loss_func( + predicts, batch)["loss_center"] * self.center_loss_weight + loss = loss + center_loss + + if self.use_ace_loss: + ace_loss = self.ace_loss_func( + predicts, batch)["loss_ace"] * self.ace_loss_weight + loss = loss + ace_loss + + return {'enhanced_ctc_loss': loss} diff --git a/ppocr/losses/rec_multi_loss.py b/ppocr/losses/rec_multi_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..09f007afe6303e83b9a6948df553ec0fca8b6b2d --- /dev/null +++ b/ppocr/losses/rec_multi_loss.py @@ -0,0 +1,58 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
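+# MultiLoss instantiates the losses named in `loss_config_list` (CTCLoss and
+# SARLoss are the ones supported below) and sums them weighted by weight_1 /
+# weight_2. The CTC branch consumes batch[:2] + batch[3:] and the SAR branch
+# batch[:1] + batch[2:], following the
+# [image, label_ctc, label_sar, length, valid_ratio] batch layout.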
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from .rec_ctc_loss import CTCLoss +from .rec_sar_loss import SARLoss + + +class MultiLoss(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + self.loss_funcs = {} + self.loss_list = kwargs.pop('loss_config_list') + self.weight_1 = kwargs.get('weight_1', 1.0) + self.weight_2 = kwargs.get('weight_2', 1.0) + self.gtc_loss = kwargs.get('gtc_loss', 'sar') + for loss_info in self.loss_list: + for name, param in loss_info.items(): + if param is not None: + kwargs.update(param) + loss = eval(name)(**kwargs) + self.loss_funcs[name] = loss + + def forward(self, predicts, batch): + self.total_loss = {} + total_loss = 0.0 + # batch [image, label_ctc, label_sar, length, valid_ratio] + for name, loss_func in self.loss_funcs.items(): + if name == 'CTCLoss': + loss = loss_func(predicts['ctc'], + batch[:2] + batch[3:])['loss'] * self.weight_1 + elif name == 'SARLoss': + loss = loss_func(predicts['sar'], + batch[:1] + batch[2:])['loss'] * self.weight_2 + else: + raise NotImplementedError( + '{} is not supported in MultiLoss yet'.format(name)) + self.total_loss[name] = loss + total_loss += loss + self.total_loss['loss'] = total_loss + return self.total_loss diff --git a/ppocr/losses/rec_pren_loss.py b/ppocr/losses/rec_pren_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7bc53d29b2474eeb7897586d1bb7b8c6df2d400e --- /dev/null +++ b/ppocr/losses/rec_pren_loss.py @@ -0,0 +1,30 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn + + +class PRENLoss(nn.Layer): + def __init__(self, **kwargs): + super(PRENLoss, self).__init__() + # note: 0 is padding idx + self.loss_func = nn.CrossEntropyLoss(reduction='mean', ignore_index=0) + + def forward(self, predicts, batch): + loss = self.loss_func(predicts, batch[1].astype('int64')) + return {'loss': loss} diff --git a/ppocr/losses/rec_rfl_loss.py b/ppocr/losses/rec_rfl_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..be0f06d903475d79c78e8e5b6b8a56c856a07ba2 --- /dev/null +++ b/ppocr/losses/rec_rfl_loss.py @@ -0,0 +1,68 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
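Note on MultiLoss above: it instantiates each sub-loss by name from loss_config_list and returns weight_1 * CTCLoss(...) + weight_2 * SARLoss(...) for the CTC and GTC (SAR) branches. The sketch below shows what such a configuration could look like in Python; the keys mirror what MultiLoss reads, but the concrete values are assumptions, not settings taken from this repository's configs.

# illustrative MultiLoss configuration; values are assumptions, not from this repo's configs
multi_loss_cfg = {
    'loss_config_list': [
        {'CTCLoss': None},  # CTC branch, no extra kwargs
        {'SARLoss': None},  # SAR (GTC) branch, no extra kwargs
    ],
    'weight_1': 1.0,  # scales the CTCLoss term
    'weight_2': 1.0,  # scales the SARLoss term
}
# MultiLoss(**multi_loss_cfg) would then compute
#   weight_1 * CTCLoss(predicts['ctc'], ...) + weight_2 * SARLoss(predicts['sar'], ...)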
+""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_common/models/loss/cross_entropy_loss.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from .basic_loss import CELoss, DistanceLoss + + +class RFLLoss(nn.Layer): + def __init__(self, ignore_index=-100, **kwargs): + super().__init__() + + self.cnt_loss = nn.MSELoss(**kwargs) + self.seq_loss = nn.CrossEntropyLoss(ignore_index=ignore_index) + + def forward(self, predicts, batch): + + self.total_loss = {} + total_loss = 0.0 + if isinstance(predicts, tuple) or isinstance(predicts, list): + cnt_outputs, seq_outputs = predicts + else: + cnt_outputs, seq_outputs = predicts, None + # batch [image, label, length, cnt_label] + if cnt_outputs is not None: + cnt_loss = self.cnt_loss(cnt_outputs, + paddle.cast(batch[3], paddle.float32)) + self.total_loss['cnt_loss'] = cnt_loss + total_loss += cnt_loss + + if seq_outputs is not None: + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = seq_outputs.shape[ + 0], seq_outputs.shape[1], seq_outputs.shape[2] + assert len(targets.shape) == len(list(seq_outputs.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = seq_outputs[:, :-1, :] + targets = targets[:, 1:] + + inputs = paddle.reshape(inputs, [-1, inputs.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + seq_loss = self.seq_loss(inputs, targets) + self.total_loss['seq_loss'] = seq_loss + total_loss += seq_loss + + self.total_loss['loss'] = total_loss + return self.total_loss diff --git a/ppocr/losses/rec_sar_loss.py b/ppocr/losses/rec_sar_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f83f03c08e4c4e6bab308aebc2daa8aa612400 --- /dev/null +++ b/ppocr/losses/rec_sar_loss.py @@ -0,0 +1,29 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SARLoss(nn.Layer): + def __init__(self, **kwargs): + super(SARLoss, self).__init__() + ignore_index = kwargs.get('ignore_index', 92) # 6626 + self.loss_func = paddle.nn.loss.CrossEntropyLoss( + reduction="mean", ignore_index=ignore_index) + + def forward(self, predicts, batch): + predict = predicts[:, : + -1, :] # ignore last index of outputs to be in same seq_len with targets + label = batch[1].astype( + "int64")[:, 1:] # ignore first index of target in loss calculation + batch_size, num_steps, num_classes = predict.shape[0], predict.shape[ + 1], predict.shape[2] + assert len(label.shape) == len(list(predict.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predict, [-1, num_classes]) + targets = paddle.reshape(label, [-1]) + loss = self.loss_func(inputs, targets) + return {'loss': loss} diff --git a/ppocr/losses/rec_spin_att_loss.py b/ppocr/losses/rec_spin_att_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..195780c7bfaf4aae5dd23bd72ace268bed9c1d4f --- /dev/null +++ b/ppocr/losses/rec_spin_att_loss.py @@ -0,0 +1,45 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +'''This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR +''' + +class SPINAttentionLoss(nn.Layer): + def __init__(self, reduction='mean', ignore_index=-100, **kwargs): + super(SPINAttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction=reduction, ignore_index=ignore_index) + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + targets = targets[:, 1:] # remove [eos] in label + + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = predicts.shape[0], predicts.shape[ + 1], predicts.shape[2] + assert len(targets.shape) == len(list(predicts.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predicts, [-1, predicts.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + + return {'loss': self.loss_func(inputs, targets)} diff --git a/ppocr/losses/rec_srn_loss.py b/ppocr/losses/rec_srn_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5b65ebaf1ee135d1fefe8d93ddc3f77985b132 --- /dev/null +++ b/ppocr/losses/rec_srn_loss.py @@ -0,0 +1,47 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class SRNLoss(nn.Layer): + def __init__(self, **kwargs): + super(SRNLoss, self).__init__() + self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="sum") + + def forward(self, predicts, batch): + predict = predicts['predict'] + word_predict = predicts['word_out'] + gsrm_predict = predicts['gsrm_out'] + label = batch[1] + + casted_label = paddle.cast(x=label, dtype='int64') + casted_label = paddle.reshape(x=casted_label, shape=[-1, 1]) + + cost_word = self.loss_func(word_predict, label=casted_label) + cost_gsrm = self.loss_func(gsrm_predict, label=casted_label) + cost_vsfd = self.loss_func(predict, label=casted_label) + + cost_word = paddle.reshape(x=paddle.sum(cost_word), shape=[1]) + cost_gsrm = paddle.reshape(x=paddle.sum(cost_gsrm), shape=[1]) + cost_vsfd = paddle.reshape(x=paddle.sum(cost_vsfd), shape=[1]) + + sum_cost = cost_word * 3.0 + cost_vsfd + cost_gsrm * 0.15 + + return {'loss': sum_cost, 'word_loss': cost_word, 'img_loss': cost_vsfd} diff --git a/ppocr/losses/rec_vl_loss.py b/ppocr/losses/rec_vl_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..5cd87c709bbf81d2fd83d721d49086256f2ab629 --- /dev/null +++ b/ppocr/losses/rec_vl_loss.py @@ -0,0 +1,70 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/wangyuxin87/VisionLAN +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class VLLoss(nn.Layer): + def __init__(self, mode='LF_1', weight_res=0.5, weight_mas=0.5, **kwargs): + super(VLLoss, self).__init__() + self.loss_func = paddle.nn.loss.CrossEntropyLoss(reduction="mean") + assert mode in ['LF_1', 'LF_2', 'LA'] + self.mode = mode + self.weight_res = weight_res + self.weight_mas = weight_mas + + def flatten_label(self, target): + label_flatten = [] + label_length = [] + for i in range(0, target.shape[0]): + cur_label = target[i].tolist() + label_flatten += cur_label[:cur_label.index(0) + 1] + label_length.append(cur_label.index(0) + 1) + label_flatten = paddle.to_tensor(label_flatten, dtype='int64') + label_length = paddle.to_tensor(label_length, dtype='int32') + return (label_flatten, label_length) + + def _flatten(self, sources, lengths): + return paddle.concat([t[:l] for t, l in zip(sources, lengths)]) + + def forward(self, predicts, batch): + text_pre = predicts[0] + target = batch[1].astype('int64') + label_flatten, length = self.flatten_label(target) + text_pre = self._flatten(text_pre, length) + if self.mode == 'LF_1': + loss = self.loss_func(text_pre, label_flatten) + else: + text_rem = predicts[1] + text_mas = predicts[2] + target_res = batch[2].astype('int64') + target_sub = batch[3].astype('int64') + label_flatten_res, length_res = self.flatten_label(target_res) + label_flatten_sub, length_sub = self.flatten_label(target_sub) + text_rem = self._flatten(text_rem, length_res) + text_mas = self._flatten(text_mas, length_sub) + loss_ori = self.loss_func(text_pre, label_flatten) + loss_res = self.loss_func(text_rem, label_flatten_res) + loss_mas = self.loss_func(text_mas, label_flatten_sub) + loss = loss_ori + loss_res * self.weight_res + loss_mas * self.weight_mas + return {'loss': loss} diff --git a/ppocr/losses/stroke_focus_loss.py b/ppocr/losses/stroke_focus_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..002bbc34774cc80599015492762ca448f593df0f --- /dev/null +++ b/ppocr/losses/stroke_focus_loss.py @@ -0,0 +1,68 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/stroke_focus_loss.py +""" +import cv2 +import sys +import time +import string +import random +import numpy as np +import paddle.nn as nn +import paddle + + +class StrokeFocusLoss(nn.Layer): + def __init__(self, character_dict_path=None, **kwargs): + super(StrokeFocusLoss, self).__init__(character_dict_path) + self.mse_loss = nn.MSELoss() + self.ce_loss = nn.CrossEntropyLoss() + self.l1_loss = nn.L1Loss() + self.english_stroke_alphabet = '0123456789' + self.english_stroke_dict = {} + for index in range(len(self.english_stroke_alphabet)): + self.english_stroke_dict[self.english_stroke_alphabet[ + index]] = index + + stroke_decompose_lines = open(character_dict_path, 'r').readlines() + self.dic = {} + for line in stroke_decompose_lines: + line = line.strip() + character, sequence = line.split() + self.dic[character] = sequence + + def forward(self, pred, data): + + sr_img = pred["sr_img"] + hr_img = pred["hr_img"] + + mse_loss = self.mse_loss(sr_img, hr_img) + word_attention_map_gt = pred["word_attention_map_gt"] + word_attention_map_pred = pred["word_attention_map_pred"] + + hr_pred = pred["hr_pred"] + sr_pred = pred["sr_pred"] + + attention_loss = paddle.nn.functional.l1_loss(word_attention_map_gt, + word_attention_map_pred) + + loss = (mse_loss + attention_loss * 50) * 100 + + return { + "mse_loss": mse_loss, + "attention_loss": attention_loss, + "loss": loss + } diff --git a/ppocr/losses/table_att_loss.py b/ppocr/losses/table_att_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f1771847b46b99d8cf2a3ae69e7e990ee02f26a5 --- /dev/null +++ b/ppocr/losses/table_att_loss.py @@ -0,0 +1,93 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class TableAttentionLoss(nn.Layer): + def __init__(self, structure_weight, loc_weight, **kwargs): + super(TableAttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') + self.structure_weight = structure_weight + self.loc_weight = loc_weight + + def forward(self, predicts, batch): + structure_probs = predicts['structure_probs'] + structure_targets = batch[1].astype("int64") + structure_targets = structure_targets[:, 1:] + structure_probs = paddle.reshape(structure_probs, + [-1, structure_probs.shape[-1]]) + structure_targets = paddle.reshape(structure_targets, [-1]) + structure_loss = self.loss_func(structure_probs, structure_targets) + + structure_loss = paddle.mean(structure_loss) * self.structure_weight + + loc_preds = predicts['loc_preds'] + loc_targets = batch[2].astype("float32") + loc_targets_mask = batch[3].astype("float32") + loc_targets = loc_targets[:, 1:, :] + loc_targets_mask = loc_targets_mask[:, 1:, :] + loc_loss = F.mse_loss(loc_preds * loc_targets_mask, + loc_targets) * self.loc_weight + + total_loss = structure_loss + loc_loss + return { + 'loss': total_loss, + "structure_loss": structure_loss, + "loc_loss": loc_loss + } + + +class SLALoss(nn.Layer): + def __init__(self, structure_weight, loc_weight, loc_loss='mse', **kwargs): + super(SLALoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='mean') + self.structure_weight = structure_weight + self.loc_weight = loc_weight + self.loc_loss = loc_loss + self.eps = 1e-12 + + def forward(self, predicts, batch): + structure_probs = predicts['structure_probs'] + structure_targets = batch[1].astype("int64") + structure_targets = structure_targets[:, 1:] + + structure_loss = self.loss_func(structure_probs, structure_targets) + + structure_loss = paddle.mean(structure_loss) * self.structure_weight + + loc_preds = predicts['loc_preds'] + loc_targets = batch[2].astype("float32") + loc_targets_mask = batch[3].astype("float32") + loc_targets = loc_targets[:, 1:, :] + loc_targets_mask = loc_targets_mask[:, 1:, :] + + loc_loss = F.smooth_l1_loss( + loc_preds * loc_targets_mask, + loc_targets * loc_targets_mask, + reduction='sum') * self.loc_weight + + loc_loss = loc_loss / (loc_targets_mask.sum() + self.eps) + total_loss = structure_loss + loc_loss + return { + 'loss': total_loss, + "structure_loss": structure_loss, + "loc_loss": loc_loss + } diff --git a/ppocr/losses/table_master_loss.py b/ppocr/losses/table_master_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..dca982dbd43e2c14f15503e1e98d6fe6c18878c5 --- /dev/null +++ b/ppocr/losses/table_master_loss.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/JiaquanYe/TableMASTER-mmocr/tree/master/mmocr/models/textrecog/losses +""" + +import paddle +from paddle import nn + + +class TableMasterLoss(nn.Layer): + def __init__(self, ignore_index=-1): + super(TableMasterLoss, self).__init__() + self.structure_loss = nn.CrossEntropyLoss( + ignore_index=ignore_index, reduction='mean') + self.box_loss = nn.L1Loss(reduction='sum') + self.eps = 1e-12 + + def forward(self, predicts, batch): + # structure_loss + structure_probs = predicts['structure_probs'] + structure_targets = batch[1] + structure_targets = structure_targets[:, 1:] + structure_probs = structure_probs.reshape( + [-1, structure_probs.shape[-1]]) + structure_targets = structure_targets.reshape([-1]) + + structure_loss = self.structure_loss(structure_probs, structure_targets) + structure_loss = structure_loss.mean() + losses = dict(structure_loss=structure_loss) + + # box loss + bboxes_preds = predicts['loc_preds'] + bboxes_targets = batch[2][:, 1:, :] + bbox_masks = batch[3][:, 1:] + # mask empty-bbox or non-bbox structure token's bbox. + + masked_bboxes_preds = bboxes_preds * bbox_masks + masked_bboxes_targets = bboxes_targets * bbox_masks + + # horizon loss (x and width) + horizon_sum_loss = self.box_loss(masked_bboxes_preds[:, :, 0::2], + masked_bboxes_targets[:, :, 0::2]) + horizon_loss = horizon_sum_loss / (bbox_masks.sum() + self.eps) + # vertical loss (y and height) + vertical_sum_loss = self.box_loss(masked_bboxes_preds[:, :, 1::2], + masked_bboxes_targets[:, :, 1::2]) + vertical_loss = vertical_sum_loss / (bbox_masks.sum() + self.eps) + + horizon_loss = horizon_loss.mean() + vertical_loss = vertical_loss.mean() + all_loss = structure_loss + horizon_loss + vertical_loss + losses.update({ + 'loss': all_loss, + 'horizon_bbox_loss': horizon_loss, + 'vertical_bbox_loss': vertical_loss + }) + return losses diff --git a/ppocr/losses/text_focus_loss.py b/ppocr/losses/text_focus_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b50628405b58c8589719cafb8c0efdaa7db05aa5 --- /dev/null +++ b/ppocr/losses/text_focus_loss.py @@ -0,0 +1,91 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/loss/text_focus_loss.py +""" + +import paddle.nn as nn +import paddle +import numpy as np +import pickle as pkl + +standard_alphebet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' +standard_dict = {} +for index in range(len(standard_alphebet)): + standard_dict[standard_alphebet[index]] = index + + +def load_confuse_matrix(confuse_dict_path): + f = open(confuse_dict_path, 'rb') + data = pkl.load(f) + f.close() + number = data[:10] + upper = data[10:36] + lower = data[36:] + end = np.ones((1, 62)) + pad = np.ones((63, 1)) + rearrange_data = np.concatenate((end, number, lower, upper), axis=0) + rearrange_data = np.concatenate((pad, rearrange_data), axis=1) + rearrange_data = 1 / rearrange_data + rearrange_data[rearrange_data == np.inf] = 1 + rearrange_data = paddle.to_tensor(rearrange_data) + + lower_alpha = 'abcdefghijklmnopqrstuvwxyz' + # upper_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + for i in range(63): + for j in range(63): + if i != j and standard_alphebet[j] in lower_alpha: + rearrange_data[i][j] = max(rearrange_data[i][j], rearrange_data[i][j + 26]) + rearrange_data = rearrange_data[:37, :37] + + return rearrange_data + + +def weight_cross_entropy(pred, gt, weight_table): + batch = gt.shape[0] + weight = weight_table[gt] + pred_exp = paddle.exp(pred) + pred_exp_weight = weight * pred_exp + loss = 0 + for i in range(len(gt)): + loss -= paddle.log(pred_exp_weight[i][gt[i]] / paddle.sum(pred_exp_weight, 1)[i]) + return loss / batch + + +class TelescopeLoss(nn.Layer): + def __init__(self, confuse_dict_path): + super(TelescopeLoss, self).__init__() + self.weight_table = load_confuse_matrix(confuse_dict_path) + self.mse_loss = nn.MSELoss() + self.ce_loss = nn.CrossEntropyLoss() + self.l1_loss = nn.L1Loss() + + def forward(self, pred, data): + sr_img = pred["sr_img"] + hr_img = pred["hr_img"] + sr_pred = pred["sr_pred"] + text_gt = pred["text_gt"] + + word_attention_map_gt = pred["word_attention_map_gt"] + word_attention_map_pred = pred["word_attention_map_pred"] + mse_loss = self.mse_loss(sr_img, hr_img) + attention_loss = self.l1_loss(word_attention_map_gt, word_attention_map_pred) + recognition_loss = weight_cross_entropy(sr_pred, text_gt, self.weight_table) + loss = mse_loss + attention_loss * 10 + recognition_loss * 0.0005 + return { + "mse_loss": mse_loss, + "attention_loss": attention_loss, + "loss": loss + } diff --git a/ppocr/losses/vqa_token_layoutlm_loss.py b/ppocr/losses/vqa_token_layoutlm_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..5d564c0e26f8fea1359ba3aa489359873a033cb9 --- /dev/null +++ b/ppocr/losses/vqa_token_layoutlm_loss.py @@ -0,0 +1,46 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from ppocr.losses.basic_loss import DMLLoss + + +class VQASerTokenLayoutLMLoss(nn.Layer): + def __init__(self, num_classes, key=None): + super().__init__() + self.loss_class = nn.CrossEntropyLoss() + self.num_classes = num_classes + self.ignore_index = self.loss_class.ignore_index + self.key = key + + def forward(self, predicts, batch): + if isinstance(predicts, dict) and self.key is not None: + predicts = predicts[self.key] + labels = batch[5] + attention_mask = batch[2] + if attention_mask is not None: + active_loss = attention_mask.reshape([-1, ]) == 1 + active_output = predicts.reshape( + [-1, self.num_classes])[active_loss] + active_label = labels.reshape([-1, ])[active_loss] + loss = self.loss_class(active_output, active_label) + else: + loss = self.loss_class( + predicts.reshape([-1, self.num_classes]), + labels.reshape([-1, ])) + return {'loss': loss} \ No newline at end of file diff --git a/ppocr/metrics/__init__.py b/ppocr/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e840a194adc2683e92c308f232dc869df34de8e --- /dev/null +++ b/ppocr/metrics/__init__.py @@ -0,0 +1,49 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy + +__all__ = ["build_metric"] + +from .det_metric import DetMetric, DetFCEMetric +from .rec_metric import RecMetric, CNTMetric, CANMetric +from .cls_metric import ClsMetric +from .e2e_metric import E2EMetric +from .distillation_metric import DistillationMetric +from .table_metric import TableMetric +from .kie_metric import KIEMetric +from .vqa_token_ser_metric import VQASerTokenMetric +from .vqa_token_re_metric import VQAReTokenMetric +from .sr_metric import SRMetric +from .ct_metric import CTMetric + + +def build_metric(config): + support_dict = [ + "DetMetric", "DetFCEMetric", "RecMetric", "ClsMetric", "E2EMetric", + "DistillationMetric", "TableMetric", 'KIEMetric', 'VQASerTokenMetric', + 'VQAReTokenMetric', 'SRMetric', 'CTMetric', 'CNTMetric', 'CANMetric' + ] + + config = copy.deepcopy(config) + module_name = config.pop("name") + assert module_name in support_dict, Exception( + "metric only support {}".format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/metrics/cls_metric.py b/ppocr/metrics/cls_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..6c077518ce205d4ec4d426aaedb8c0af880122ee --- /dev/null +++ b/ppocr/metrics/cls_metric.py @@ -0,0 +1,46 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class ClsMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + for (pred, pred_conf), (target, _) in zip(preds, labels): + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + return {'acc': correct_num / (all_num + self.eps), } + + def get_metric(self): + """ + return metrics { + 'acc': 0 + } + """ + acc = self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 diff --git a/ppocr/metrics/ct_metric.py b/ppocr/metrics/ct_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..a7634230a23027a5dd5c32a7b8eb87ee4a229076 --- /dev/null +++ b/ppocr/metrics/ct_metric.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from scipy import io +import numpy as np + +from ppocr.utils.e2e_metric.Deteval import combine_results, get_score_C + + +class CTMetric(object): + def __init__(self, main_indicator, delimiter='\t', **kwargs): + self.delimiter = delimiter + self.main_indicator = main_indicator + self.reset() + + def reset(self): + self.results = [] # clear results + + def __call__(self, preds, batch, **kwargs): + # NOTE: only support bs=1 now, as the label length of different sample is Unequal + assert len( + preds) == 1, "CentripetalText test now only suuport batch_size=1." + label = batch[2] + text = batch[3] + pred = preds[0]['points'] + result = get_score_C(label, text, pred) + + self.results.append(result) + + def get_metric(self): + """ + Input format: y0,x0, ..... yn,xn. Each detection is separated by the end of line token ('\n')' + """ + metrics = combine_results(self.results, rec_flag=False) + self.reset() + return metrics diff --git a/ppocr/metrics/det_metric.py b/ppocr/metrics/det_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..dca94c092790b121c46c4f1c0f0355391ef8dba9 --- /dev/null +++ b/ppocr/metrics/det_metric.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +__all__ = ['DetMetric', 'DetFCEMetric'] + +from .eval_det_iou import DetectionIoUEvaluator + + +class DetMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.evaluator = DetectionIoUEvaluator() + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + ''' + batch: a list produced by dataloaders. + image: np.ndarray of shape (N, C, H, W). + ratio_list: np.ndarray of shape(N,2) + polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not. + preds: a list of dict produced by post process + points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ''' + gt_polyons_batch = batch[2] + ignore_tags_batch = batch[3] + for pred, gt_polyons, ignore_tags in zip(preds, gt_polyons_batch, + ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': '', + 'ignore': ignore_tag + } for gt_polyon, ignore_tag in zip(gt_polyons, ignore_tags)] + # prepare det + det_info_list = [{ + 'points': det_polyon, + 'text': '' + } for det_polyon in pred['points']] + result = self.evaluator.evaluate_image(gt_info_list, det_info_list) + self.results.append(result) + + def get_metric(self): + """ + return metrics { + 'precision': 0, + 'recall': 0, + 'hmean': 0 + } + """ + + metrics = self.evaluator.combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results + + +class DetFCEMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.evaluator = DetectionIoUEvaluator() + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + ''' + batch: a list produced by dataloaders. + image: np.ndarray of shape (N, C, H, W). + ratio_list: np.ndarray of shape(N,2) + polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not. + preds: a list of dict produced by post process + points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. 
+ ''' + gt_polyons_batch = batch[2] + ignore_tags_batch = batch[3] + + for pred, gt_polyons, ignore_tags in zip(preds, gt_polyons_batch, + ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': '', + 'ignore': ignore_tag + } for gt_polyon, ignore_tag in zip(gt_polyons, ignore_tags)] + # prepare det + det_info_list = [{ + 'points': det_polyon, + 'text': '', + 'score': score + } for det_polyon, score in zip(pred['points'], pred['scores'])] + + for score_thr in self.results.keys(): + det_info_list_thr = [ + det_info for det_info in det_info_list + if det_info['score'] >= score_thr + ] + result = self.evaluator.evaluate_image(gt_info_list, + det_info_list_thr) + self.results[score_thr].append(result) + + def get_metric(self): + """ + return metrics {'heman':0, + 'thr 0.3':'precision: 0 recall: 0 hmean: 0', + 'thr 0.4':'precision: 0 recall: 0 hmean: 0', + 'thr 0.5':'precision: 0 recall: 0 hmean: 0', + 'thr 0.6':'precision: 0 recall: 0 hmean: 0', + 'thr 0.7':'precision: 0 recall: 0 hmean: 0', + 'thr 0.8':'precision: 0 recall: 0 hmean: 0', + 'thr 0.9':'precision: 0 recall: 0 hmean: 0', + } + """ + metrics = {} + hmean = 0 + for score_thr in self.results.keys(): + metric = self.evaluator.combine_results(self.results[score_thr]) + # for key, value in metric.items(): + # metrics['{}_{}'.format(key, score_thr)] = value + metric_str = 'precision:{:.5f} recall:{:.5f} hmean:{:.5f}'.format( + metric['precision'], metric['recall'], metric['hmean']) + metrics['thr {}'.format(score_thr)] = metric_str + hmean = max(hmean, metric['hmean']) + metrics['hmean'] = hmean + + self.reset() + return metrics + + def reset(self): + self.results = { + 0.3: [], + 0.4: [], + 0.5: [], + 0.6: [], + 0.7: [], + 0.8: [], + 0.9: [] + } # clear results diff --git a/ppocr/metrics/distillation_metric.py b/ppocr/metrics/distillation_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e2cbc4dc07c4e7b234ca964eb9fc0259dfab6ab4 --- /dev/null +++ b/ppocr/metrics/distillation_metric.py @@ -0,0 +1,75 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import copy + +from .rec_metric import RecMetric +from .det_metric import DetMetric +from .e2e_metric import E2EMetric +from .cls_metric import ClsMetric +from .vqa_token_ser_metric import VQASerTokenMetric +from .vqa_token_re_metric import VQAReTokenMetric + + +class DistillationMetric(object): + def __init__(self, + key=None, + base_metric_name=None, + main_indicator=None, + **kwargs): + self.main_indicator = main_indicator + self.key = key + self.main_indicator = main_indicator + self.base_metric_name = base_metric_name + self.kwargs = kwargs + self.metrics = None + + def _init_metrcis(self, preds): + self.metrics = dict() + mod = importlib.import_module(__name__) + for key in preds: + self.metrics[key] = getattr(mod, self.base_metric_name)( + main_indicator=self.main_indicator, **self.kwargs) + self.metrics[key].reset() + + def __call__(self, preds, batch, **kwargs): + assert isinstance(preds, dict) + if self.metrics is None: + self._init_metrcis(preds) + output = dict() + for key in preds: + self.metrics[key].__call__(preds[key], batch, **kwargs) + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + output = dict() + for key in self.metrics: + metric = self.metrics[key].get_metric() + # main indicator + if key == self.key: + output.update(metric) + else: + for sub_key in metric: + output["{}_{}".format(key, sub_key)] = metric[sub_key] + return output + + def reset(self): + for key in self.metrics: + self.metrics[key].reset() diff --git a/ppocr/metrics/e2e_metric.py b/ppocr/metrics/e2e_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8ba3b222022099501e6bda48266ef51a40e9db --- /dev/null +++ b/ppocr/metrics/e2e_metric.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
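Note on DistillationMetric above: like build_metric in ppocr/metrics/__init__.py, it resolves a metric class from its string name at runtime (importlib.import_module plus getattr) and instantiates one copy per distillation head. The standalone sketch below illustrates that dispatch pattern with a stub class; the stub name is hypothetical and exists only for the example.

import importlib


class AccStubMetric:
    """Hypothetical stand-in for a real metric class such as RecMetric."""

    def __init__(self, main_indicator='acc', **kwargs):
        self.main_indicator = main_indicator


# same pattern as DistillationMetric._init_metrcis: look the class up by name
# in the current module, then instantiate it with the shared kwargs
mod = importlib.import_module(__name__)
metric_cls = getattr(mod, 'AccStubMetric')
metric = metric_cls(main_indicator='acc')
print(type(metric).__name__, metric.main_indicator)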
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +__all__ = ['E2EMetric'] + +from ppocr.utils.e2e_metric.Deteval import get_socre_A, get_socre_B, combine_results +from ppocr.utils.e2e_utils.extract_textpoint_slow import get_dict + + +class E2EMetric(object): + def __init__(self, + mode, + gt_mat_dir, + character_dict_path, + main_indicator='f_score_e2e', + **kwargs): + self.mode = mode + self.gt_mat_dir = gt_mat_dir + self.label_list = get_dict(character_dict_path) + self.max_index = len(self.label_list) + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + if self.mode == 'A': + gt_polyons_batch = batch[2] + temp_gt_strs_batch = batch[3][0] + ignore_tags_batch = batch[4] + gt_strs_batch = [] + + for temp_list in temp_gt_strs_batch: + t = "" + for index in temp_list: + if index < self.max_index: + t += self.label_list[index] + gt_strs_batch.append(t) + + for pred, gt_polyons, gt_strs, ignore_tags in zip( + [preds], gt_polyons_batch, [gt_strs_batch], ignore_tags_batch): + # prepare gt + gt_info_list = [{ + 'points': gt_polyon, + 'text': gt_str, + 'ignore': ignore_tag + } for gt_polyon, gt_str, ignore_tag in + zip(gt_polyons, gt_strs, ignore_tags)] + # prepare det + e2e_info_list = [{ + 'points': det_polyon, + 'texts': pred_str + } for det_polyon, pred_str in + zip(pred['points'], pred['texts'])] + + result = get_socre_A(gt_info_list, e2e_info_list) + self.results.append(result) + else: + img_id = batch[5][0] + e2e_info_list = [{ + 'points': det_polyon, + 'texts': pred_str + } for det_polyon, pred_str in zip(preds['points'], preds['texts'])] + result = get_socre_B(self.gt_mat_dir, img_id, e2e_info_list) + self.results.append(result) + + def get_metric(self): + metrics = combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results diff --git a/ppocr/metrics/eval_det_iou.py b/ppocr/metrics/eval_det_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..c144886b3f84a458a88931d6beb2153054eba7d0 --- /dev/null +++ b/ppocr/metrics/eval_det_iou.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from collections import namedtuple +import numpy as np +from shapely.geometry import Polygon +""" +reference from : +https://github.com/MhLiao/DB/blob/3c32b808d4412680310d3d28eeb6a2d5bf1566c5/concern/icdar2015_eval/detection/iou.py#L8 +""" + + +class DetectionIoUEvaluator(object): + def __init__(self, iou_constraint=0.5, area_precision_constraint=0.5): + self.iou_constraint = iou_constraint + self.area_precision_constraint = area_precision_constraint + + def evaluate_image(self, gt, pred): + def get_union(pD, pG): + return Polygon(pD).union(Polygon(pG)).area + + def get_intersection_over_union(pD, pG): + return get_intersection(pD, pG) / get_union(pD, pG) + + def get_intersection(pD, pG): + return Polygon(pD).intersection(Polygon(pG)).area + + def compute_ap(confList, matchList, numGtCare): + correct = 0 + AP = 0 + if len(confList) > 0: + confList = np.array(confList) + matchList = np.array(matchList) + sorted_ind = np.argsort(-confList) + confList = confList[sorted_ind] + matchList = matchList[sorted_ind] + for n in range(len(confList)): + match = matchList[n] + if match: + correct += 1 + AP += float(correct) / (n + 1) + + if numGtCare > 0: + AP /= numGtCare + + return AP + + perSampleMetrics = {} + + matchedSum = 0 + + Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax') + + numGlobalCareGt 
= 0 + numGlobalCareDet = 0 + + arrGlobalConfidences = [] + arrGlobalMatches = [] + + recall = 0 + precision = 0 + hmean = 0 + + detMatched = 0 + + iouMat = np.empty([1, 1]) + + gtPols = [] + detPols = [] + + gtPolPoints = [] + detPolPoints = [] + + # Array of Ground Truth Polygons' keys marked as don't Care + gtDontCarePolsNum = [] + # Array of Detected Polygons' matched with a don't Care GT + detDontCarePolsNum = [] + + pairs = [] + detMatchedNums = [] + + arrSampleConfidences = [] + arrSampleMatch = [] + + evaluationLog = "" + + for n in range(len(gt)): + points = gt[n]['points'] + dontCare = gt[n]['ignore'] + if not Polygon(points).is_valid: + continue + + gtPol = points + gtPols.append(gtPol) + gtPolPoints.append(points) + if dontCare: + gtDontCarePolsNum.append(len(gtPols) - 1) + + evaluationLog += "GT polygons: " + str(len(gtPols)) + ( + " (" + str(len(gtDontCarePolsNum)) + " don't care)\n" + if len(gtDontCarePolsNum) > 0 else "\n") + + for n in range(len(pred)): + points = pred[n]['points'] + if not Polygon(points).is_valid: + continue + + detPol = points + detPols.append(detPol) + detPolPoints.append(points) + if len(gtDontCarePolsNum) > 0: + for dontCarePol in gtDontCarePolsNum: + dontCarePol = gtPols[dontCarePol] + intersected_area = get_intersection(dontCarePol, detPol) + pdDimensions = Polygon(detPol).area + precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions + if (precision > self.area_precision_constraint): + detDontCarePolsNum.append(len(detPols) - 1) + break + + evaluationLog += "DET polygons: " + str(len(detPols)) + ( + " (" + str(len(detDontCarePolsNum)) + " don't care)\n" + if len(detDontCarePolsNum) > 0 else "\n") + + if len(gtPols) > 0 and len(detPols) > 0: + # Calculate IoU and precision matrixs + outputShape = [len(gtPols), len(detPols)] + iouMat = np.empty(outputShape) + gtRectMat = np.zeros(len(gtPols), np.int8) + detRectMat = np.zeros(len(detPols), np.int8) + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + pG = gtPols[gtNum] + pD = detPols[detNum] + iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG) + + for gtNum in range(len(gtPols)): + for detNum in range(len(detPols)): + if gtRectMat[gtNum] == 0 and detRectMat[ + detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum: + if iouMat[gtNum, detNum] > self.iou_constraint: + gtRectMat[gtNum] = 1 + detRectMat[detNum] = 1 + detMatched += 1 + pairs.append({'gt': gtNum, 'det': detNum}) + detMatchedNums.append(detNum) + evaluationLog += "Match GT #" + \ + str(gtNum) + " with Det #" + str(detNum) + "\n" + + numGtCare = (len(gtPols) - len(gtDontCarePolsNum)) + numDetCare = (len(detPols) - len(detDontCarePolsNum)) + if numGtCare == 0: + recall = float(1) + precision = float(0) if numDetCare > 0 else float(1) + else: + recall = float(detMatched) / numGtCare + precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare + + hmean = 0 if (precision + recall) == 0 else 2.0 * \ + precision * recall / (precision + recall) + + matchedSum += detMatched + numGlobalCareGt += numGtCare + numGlobalCareDet += numDetCare + + perSampleMetrics = { + 'gtCare': numGtCare, + 'detCare': numDetCare, + 'detMatched': detMatched, + } + return perSampleMetrics + + def combine_results(self, results): + numGlobalCareGt = 0 + numGlobalCareDet = 0 + matchedSum = 0 + for result in results: + numGlobalCareGt += result['gtCare'] + numGlobalCareDet += result['detCare'] + matchedSum += result['detMatched'] + + methodRecall = 0 if numGlobalCareGt == 0 else float( + 
matchedSum) / numGlobalCareGt + methodPrecision = 0 if numGlobalCareDet == 0 else float( + matchedSum) / numGlobalCareDet + methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \ + methodRecall * methodPrecision / ( + methodRecall + methodPrecision) + methodMetrics = { + 'precision': methodPrecision, + 'recall': methodRecall, + 'hmean': methodHmean + } + + return methodMetrics + + +if __name__ == '__main__': + evaluator = DetectionIoUEvaluator() + gts = [[{ + 'points': [(0, 0), (1, 0), (1, 1), (0, 1)], + 'text': 1234, + 'ignore': False, + }, { + 'points': [(2, 2), (3, 2), (3, 3), (2, 3)], + 'text': 5678, + 'ignore': False, + }]] + preds = [[{ + 'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], + 'text': 123, + 'ignore': False, + }]] + results = [] + for gt, pred in zip(gts, preds): + results.append(evaluator.evaluate_image(gt, pred)) + metrics = evaluator.combine_results(results) + print(metrics) diff --git a/ppocr/metrics/kie_metric.py b/ppocr/metrics/kie_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..28ab22b807ffe4347ca95be5a44143539db4c97c --- /dev/null +++ b/ppocr/metrics/kie_metric.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
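Note on DetectionIoUEvaluator above: a detection is matched to a ground-truth polygon when their intersection-over-union exceeds iou_constraint (0.5 by default), with areas computed via shapely. The example below reproduces that IoU computation in isolation for two overlapping axis-aligned squares; the boxes are made up so the arithmetic is easy to check.

from shapely.geometry import Polygon

# two unit squares offset by 0.5 in both directions (made-up example boxes)
gt = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
det = Polygon([(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)])

inter = det.intersection(gt).area  # 0.25
union = det.union(gt).area         # 1.75
iou = inter / union                # ~0.143, below the default 0.5 threshold

print(inter, union, iou)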
+# The code is refer from: https://github.com/open-mmlab/mmocr/blob/main/mmocr/core/evaluation/kie_metric.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class KIEMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + self.node = [] + self.gt = [] + + def __call__(self, preds, batch, **kwargs): + nodes, _ = preds + gts, tag = batch[4].squeeze(0), batch[5].tolist()[0] + gts = gts[:tag[0], :1].reshape([-1]) + self.node.append(nodes.numpy()) + self.gt.append(gts) + # result = self.compute_f1_score(nodes, gts) + # self.results.append(result) + + def compute_f1_score(self, preds, gts): + ignores = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25] + C = preds.shape[1] + classes = np.array(sorted(set(range(C)) - set(ignores))) + hist = np.bincount( + (gts * C).astype('int64') + preds.argmax(1), minlength=C + **2).reshape([C, C]).astype('float32') + diag = np.diag(hist) + recalls = diag / hist.sum(1).clip(min=1) + precisions = diag / hist.sum(0).clip(min=1) + f1 = 2 * recalls * precisions / (recalls + precisions).clip(min=1e-8) + return f1[classes] + + def combine_results(self, results): + node = np.concatenate(self.node, 0) + gts = np.concatenate(self.gt, 0) + results = self.compute_f1_score(node, gts) + data = {'hmean': results.mean()} + return data + + def get_metric(self): + + metrics = self.combine_results(self.results) + self.reset() + return metrics + + def reset(self): + self.results = [] # clear results + self.node = [] + self.gt = [] diff --git a/ppocr/metrics/rec_metric.py b/ppocr/metrics/rec_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..305b913c72da5842b6654f1fc9b27e6e2b46b436 --- /dev/null +++ b/ppocr/metrics/rec_metric.py @@ -0,0 +1,179 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
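Note on KIEMetric.compute_f1_score above: it builds a C x C confusion matrix in one shot by encoding each (ground truth, prediction) pair as gt * C + pred and counting with np.bincount, then derives per-class recall and precision from the diagonal. A tiny NumPy sketch of that encoding trick, using made-up labels for three classes and predictions that are already argmax-ed:

import numpy as np

C = 3  # number of classes (made-up)
gts = np.array([0, 1, 2, 1])
preds = np.array([0, 2, 2, 1])

# row = ground truth, column = prediction
hist = np.bincount(gts * C + preds, minlength=C**2).reshape([C, C]).astype('float32')

recalls = np.diag(hist) / hist.sum(1).clip(min=1)
precisions = np.diag(hist) / hist.sum(0).clip(min=1)
print(hist)
print(recalls, precisions)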
+ +from rapidfuzz.distance import Levenshtein +from difflib import SequenceMatcher + +import numpy as np +import string + + +class RecMetric(object): + def __init__(self, + main_indicator='acc', + is_filter=False, + ignore_space=True, + **kwargs): + self.main_indicator = main_indicator + self.is_filter = is_filter + self.ignore_space = ignore_space + self.eps = 1e-5 + self.reset() + + def _normalize_text(self, text): + text = ''.join( + filter(lambda x: x in (string.digits + string.ascii_letters), text)) + return text.lower() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + norm_edit_dis = 0.0 + for (pred, pred_conf), (target, _) in zip(preds, labels): + if self.ignore_space: + pred = pred.replace(" ", "") + target = target.replace(" ", "") + if self.is_filter: + pred = self._normalize_text(pred) + target = self._normalize_text(target) + norm_edit_dis += Levenshtein.normalized_distance(pred, target) + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + self.norm_edit_dis += norm_edit_dis + return { + 'acc': correct_num / (all_num + self.eps), + 'norm_edit_dis': 1 - norm_edit_dis / (all_num + self.eps) + } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + norm_edit_dis = 1 - self.norm_edit_dis / (self.all_num + self.eps) + self.reset() + return {'acc': acc, 'norm_edit_dis': norm_edit_dis} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.norm_edit_dis = 0 + + +class CNTMetric(object): + def __init__(self, main_indicator='acc', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + for pred, target in zip(preds, labels): + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + return {'acc': correct_num / (all_num + self.eps), } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + + +class CANMetric(object): + def __init__(self, main_indicator='exp_rate', **kwargs): + self.main_indicator = main_indicator + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 + self.word_rate = 0 + self.exp_rate = 0 + self.reset() + self.epoch_reset() + + def __call__(self, preds, batch, **kwargs): + for k, v in kwargs.items(): + epoch_reset = v + if epoch_reset: + self.epoch_reset() + word_probs = preds + word_label, word_label_mask = batch + line_right = 0 + if word_probs is not None: + word_pred = word_probs.argmax(2) + word_pred = word_pred.cpu().detach().numpy() + word_scores = [ + SequenceMatcher( + None, + s1[:int(np.sum(s3))], + s2[:int(np.sum(s3))], + autojunk=False).ratio() * ( + len(s1[:int(np.sum(s3))]) + len(s2[:int(np.sum(s3))])) / + len(s1[:int(np.sum(s3))]) / 2 + for s1, s2, s3 in zip(word_label, word_pred, word_label_mask) + ] + batch_size = len(word_scores) + for i in range(batch_size): + if word_scores[i] == 1: + line_right += 1 + self.word_rate = np.mean(word_scores) #float + self.exp_rate = line_right / batch_size #float + exp_length, word_length = word_label.shape[:2] + self.word_right.append(self.word_rate * 
word_length) + self.exp_right.append(self.exp_rate * exp_length) + self.word_total_length = self.word_total_length + word_length + self.exp_total_num = self.exp_total_num + exp_length + + def get_metric(self): + """ + return { + 'word_rate': 0, + "exp_rate": 0, + } + """ + cur_word_rate = sum(self.word_right) / self.word_total_length + cur_exp_rate = sum(self.exp_right) / self.exp_total_num + self.reset() + return {'word_rate': cur_word_rate, "exp_rate": cur_exp_rate} + + def reset(self): + self.word_rate = 0 + self.exp_rate = 0 + + def epoch_reset(self): + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 diff --git a/ppocr/metrics/sr_metric.py b/ppocr/metrics/sr_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..51c3ad66564e61abdd91432e6dc9ea1d8918583b --- /dev/null +++ b/ppocr/metrics/sr_metric.py @@ -0,0 +1,155 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/utils/ssim_psnr.py +""" + +from math import exp + +import paddle +import paddle.nn.functional as F +import paddle.nn as nn +import string + + +class SSIM(nn.Layer): + def __init__(self, window_size=11, size_average=True): + super(SSIM, self).__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.window = self.create_window(window_size, self.channel) + + def gaussian(self, window_size, sigma): + gauss = paddle.to_tensor([ + exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def create_window(self, window_size, channel): + _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) + window = _2D_window.expand([channel, 1, window_size, window_size]) + return window + + def _ssim(self, img1, img2, window, window_size, channel, + size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean([1, 2, 3]) + + def ssim(self, img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = self.create_window(window_size, channel) + + return self._ssim(img1, img2, window, window_size, channel, + size_average) + + def forward(self, img1, img2): 
+ (_, channel, _, _) = img1.shape + + if channel == self.channel and self.window.dtype == img1.dtype: + window = self.window + else: + window = self.create_window(self.window_size, channel) + + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) + + +class SRMetric(object): + def __init__(self, main_indicator='all', **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.psnr_result = [] + self.ssim_result = [] + self.calculate_ssim = SSIM() + self.reset() + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.norm_edit_dis = 0 + self.psnr_result = [] + self.ssim_result = [] + + def calculate_psnr(self, img1, img2): + # img1 and img2 have range [0, 1] + mse = ((img1 * 255 - img2 * 255)**2).mean() + if mse == 0: + return float('inf') + return 20 * paddle.log10(255.0 / paddle.sqrt(mse)) + + def _normalize_text(self, text): + text = ''.join( + filter(lambda x: x in (string.digits + string.ascii_letters), text)) + return text.lower() + + def __call__(self, pred_label, *args, **kwargs): + metric = {} + images_sr = pred_label["sr_img"] + images_hr = pred_label["hr_img"] + psnr = self.calculate_psnr(images_sr, images_hr) + ssim = self.calculate_ssim(images_sr, images_hr) + self.psnr_result.append(psnr) + self.ssim_result.append(ssim) + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + self.psnr_avg = sum(self.psnr_result) / len(self.psnr_result) + self.psnr_avg = round(self.psnr_avg.item(), 6) + self.ssim_avg = sum(self.ssim_result) / len(self.ssim_result) + self.ssim_avg = round(self.ssim_avg.item(), 6) + + self.all_avg = self.psnr_avg + self.ssim_avg + + self.reset() + return { + 'psnr_avg': self.psnr_avg, + "ssim_avg": self.ssim_avg, + "all": self.all_avg + } diff --git a/ppocr/metrics/table_metric.py b/ppocr/metrics/table_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..c0b247efa672caacb9a9a09a8ef0da58e47367e4 --- /dev/null +++ b/ppocr/metrics/table_metric.py @@ -0,0 +1,156 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
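+# Table recognition metrics. TableStructureMetric compares the predicted HTML
+# structure token sequence against the ground-truth sequence, while TableMetric
+# can additionally score cell boxes through DetMetric when compute_bbox_metric=True.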
+import numpy as np +from ppocr.metrics.det_metric import DetMetric + + +class TableStructureMetric(object): + def __init__(self, + main_indicator='acc', + eps=1e-6, + del_thead_tbody=False, + **kwargs): + self.main_indicator = main_indicator + self.eps = eps + self.del_thead_tbody = del_thead_tbody + self.reset() + + def __call__(self, pred_label, batch=None, *args, **kwargs): + preds, labels = pred_label + pred_structure_batch_list = preds['structure_batch_list'] + gt_structure_batch_list = labels['structure_batch_list'] + correct_num = 0 + all_num = 0 + for (pred, pred_conf), target in zip(pred_structure_batch_list, + gt_structure_batch_list): + pred_str = ''.join(pred) + target_str = ''.join(target) + if self.del_thead_tbody: + pred_str = pred_str.replace('', '').replace( + '', '').replace('', '').replace('', + '') + target_str = target_str.replace('', '').replace( + '', '').replace('', '').replace('', + '') + if pred_str == target_str: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {'acc': acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.len_acc_num = 0 + self.token_nums = 0 + self.anys_dict = dict() + + +class TableMetric(object): + def __init__(self, + main_indicator='acc', + compute_bbox_metric=False, + box_format='xyxy', + del_thead_tbody=False, + **kwargs): + """ + + @param sub_metrics: configs of sub_metric + @param main_matric: main_matric for save best_model + @param kwargs: + """ + self.structure_metric = TableStructureMetric( + del_thead_tbody=del_thead_tbody) + self.bbox_metric = DetMetric() if compute_bbox_metric else None + self.main_indicator = main_indicator + self.box_format = box_format + self.reset() + + def __call__(self, pred_label, batch=None, *args, **kwargs): + self.structure_metric(pred_label) + if self.bbox_metric is not None: + self.bbox_metric(*self.prepare_bbox_metric_input(pred_label)) + + def prepare_bbox_metric_input(self, pred_label): + pred_bbox_batch_list = [] + gt_ignore_tags_batch_list = [] + gt_bbox_batch_list = [] + preds, labels = pred_label + + batch_num = len(preds['bbox_batch_list']) + for batch_idx in range(batch_num): + # pred + pred_bbox_list = [ + self.format_box(pred_box) + for pred_box in preds['bbox_batch_list'][batch_idx] + ] + pred_bbox_batch_list.append({'points': pred_bbox_list}) + + # gt + gt_bbox_list = [] + gt_ignore_tags_list = [] + for gt_box in labels['bbox_batch_list'][batch_idx]: + gt_bbox_list.append(self.format_box(gt_box)) + gt_ignore_tags_list.append(0) + gt_bbox_batch_list.append(gt_bbox_list) + gt_ignore_tags_batch_list.append(gt_ignore_tags_list) + + return [ + pred_bbox_batch_list, + [0, 0, gt_bbox_batch_list, gt_ignore_tags_batch_list] + ] + + def get_metric(self): + structure_metric = self.structure_metric.get_metric() + if self.bbox_metric is None: + return structure_metric + bbox_metric = self.bbox_metric.get_metric() + if self.main_indicator == self.bbox_metric.main_indicator: + output = bbox_metric + for sub_key in structure_metric: + output["structure_metric_{}".format( + sub_key)] = structure_metric[sub_key] + else: + output = structure_metric + for sub_key in bbox_metric: + output["bbox_metric_{}".format(sub_key)] = bbox_metric[sub_key] + return output + + def reset(self): + self.structure_metric.reset() + if self.bbox_metric is not None: + self.bbox_metric.reset() + + def 
format_box(self, box): + if self.box_format == 'xyxy': + x1, y1, x2, y2 = box + box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + elif self.box_format == 'xywh': + x, y, w, h = box + x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 + box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + elif self.box_format == 'xyxyxyxy': + x1, y1, x2, y2, x3, y3, x4, y4 = box + box = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] + return box diff --git a/ppocr/metrics/vqa_token_re_metric.py b/ppocr/metrics/vqa_token_re_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..0509984f7e7e85fc1dae859761fedb7356a02477 --- /dev/null +++ b/ppocr/metrics/vqa_token_re_metric.py @@ -0,0 +1,181 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class VQAReTokenMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + pred_relations, relations, entities = preds + self.pred_relations_list.extend(pred_relations) + self.relations_list.extend(relations) + self.entities_list.extend(entities) + + def get_metric(self): + gt_relations = [] + for b in range(len(self.relations_list)): + rel_sent = [] + relation_list = self.relations_list[b] + entitie_list = self.entities_list[b] + head_len = relation_list[0, 0] + if head_len > 0: + entitie_start_list = entitie_list[1:entitie_list[0, 0] + 1, 0] + entitie_end_list = entitie_list[1:entitie_list[0, 1] + 1, 1] + entitie_label_list = entitie_list[1:entitie_list[0, 2] + 1, 2] + for head, tail in zip(relation_list[1:head_len + 1, 0], + relation_list[1:head_len + 1, 1]): + rel = {} + rel["head_id"] = head + rel["head"] = (entitie_start_list[head], + entitie_end_list[head]) + rel["head_type"] = entitie_label_list[head] + + rel["tail_id"] = tail + rel["tail"] = (entitie_start_list[tail], + entitie_end_list[tail]) + rel["tail_type"] = entitie_label_list[tail] + + rel["type"] = 1 + rel_sent.append(rel) + gt_relations.append(rel_sent) + re_metrics = self.re_score( + self.pred_relations_list, gt_relations, mode="boundaries") + metrics = { + "precision": re_metrics["ALL"]["p"], + "recall": re_metrics["ALL"]["r"], + "hmean": re_metrics["ALL"]["f1"], + } + self.reset() + return metrics + + def reset(self): + self.pred_relations_list = [] + self.relations_list = [] + self.entities_list = [] + + def re_score(self, pred_relations, gt_relations, mode="strict"): + """Evaluate RE predictions + + Args: + pred_relations (list) : list of list of predicted relations (several relations in each sentence) + gt_relations (list) : list of list of ground truth relations + + rel = { "head": (start_idx (inclusive), end_idx (exclusive)), + "tail": (start_idx (inclusive), end_idx (exclusive)), + "head_type": ent_type, + 
"tail_type": ent_type, + "type": rel_type} + + vocab (Vocab) : dataset vocabulary + mode (str) : in 'strict' or 'boundaries'""" + + assert mode in ["strict", "boundaries"] + + relation_types = [v for v in [0, 1] if not v == 0] + scores = { + rel: { + "tp": 0, + "fp": 0, + "fn": 0 + } + for rel in relation_types + ["ALL"] + } + + # Count GT relations and Predicted relations + n_sents = len(gt_relations) + n_rels = sum([len([rel for rel in sent]) for sent in gt_relations]) + n_found = sum([len([rel for rel in sent]) for sent in pred_relations]) + + # Count TP, FP and FN per type + for pred_sent, gt_sent in zip(pred_relations, gt_relations): + for rel_type in relation_types: + # strict mode takes argument types into account + if mode == "strict": + pred_rels = {(rel["head"], rel["head_type"], rel["tail"], + rel["tail_type"]) + for rel in pred_sent + if rel["type"] == rel_type} + gt_rels = {(rel["head"], rel["head_type"], rel["tail"], + rel["tail_type"]) + for rel in gt_sent if rel["type"] == rel_type} + + # boundaries mode only takes argument spans into account + elif mode == "boundaries": + pred_rels = {(rel["head"], rel["tail"]) + for rel in pred_sent + if rel["type"] == rel_type} + gt_rels = {(rel["head"], rel["tail"]) + for rel in gt_sent if rel["type"] == rel_type} + + scores[rel_type]["tp"] += len(pred_rels & gt_rels) + scores[rel_type]["fp"] += len(pred_rels - gt_rels) + scores[rel_type]["fn"] += len(gt_rels - pred_rels) + + # Compute per entity Precision / Recall / F1 + for rel_type in scores.keys(): + if scores[rel_type]["tp"]: + scores[rel_type]["p"] = scores[rel_type]["tp"] / ( + scores[rel_type]["fp"] + scores[rel_type]["tp"]) + scores[rel_type]["r"] = scores[rel_type]["tp"] / ( + scores[rel_type]["fn"] + scores[rel_type]["tp"]) + else: + scores[rel_type]["p"], scores[rel_type]["r"] = 0, 0 + + if not scores[rel_type]["p"] + scores[rel_type]["r"] == 0: + scores[rel_type]["f1"] = ( + 2 * scores[rel_type]["p"] * scores[rel_type]["r"] / + (scores[rel_type]["p"] + scores[rel_type]["r"])) + else: + scores[rel_type]["f1"] = 0 + + # Compute micro F1 Scores + tp = sum([scores[rel_type]["tp"] for rel_type in relation_types]) + fp = sum([scores[rel_type]["fp"] for rel_type in relation_types]) + fn = sum([scores[rel_type]["fn"] for rel_type in relation_types]) + + if tp: + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f1 = 2 * precision * recall / (precision + recall) + + else: + precision, recall, f1 = 0, 0, 0 + + scores["ALL"]["p"] = precision + scores["ALL"]["r"] = recall + scores["ALL"]["f1"] = f1 + scores["ALL"]["tp"] = tp + scores["ALL"]["fp"] = fp + scores["ALL"]["fn"] = fn + + # Compute Macro F1 Scores + scores["ALL"]["Macro_f1"] = np.mean( + [scores[ent_type]["f1"] for ent_type in relation_types]) + scores["ALL"]["Macro_p"] = np.mean( + [scores[ent_type]["p"] for ent_type in relation_types]) + scores["ALL"]["Macro_r"] = np.mean( + [scores[ent_type]["r"] for ent_type in relation_types]) + + return scores diff --git a/ppocr/metrics/vqa_token_ser_metric.py b/ppocr/metrics/vqa_token_ser_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..286d8addaf32ba8e4fa12751c6e270c853775bd4 --- /dev/null +++ b/ppocr/metrics/vqa_token_ser_metric.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle + +__all__ = ['KIEMetric'] + + +class VQASerTokenMetric(object): + def __init__(self, main_indicator='hmean', **kwargs): + self.main_indicator = main_indicator + self.reset() + + def __call__(self, preds, batch, **kwargs): + preds, labels = preds + self.pred_list.extend(preds) + self.gt_list.extend(labels) + + def get_metric(self): + from seqeval.metrics import f1_score, precision_score, recall_score + metrics = { + "precision": precision_score(self.gt_list, self.pred_list), + "recall": recall_score(self.gt_list, self.pred_list), + "hmean": f1_score(self.gt_list, self.pred_list), + } + self.reset() + return metrics + + def reset(self): + self.pred_list = [] + self.gt_list = [] diff --git a/ppocr/modeling/architectures/__init__.py b/ppocr/modeling/architectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1c955ef3abe9c38e816616cc9b5399c6832aa5f1 --- /dev/null +++ b/ppocr/modeling/architectures/__init__.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import importlib + +from paddle.jit import to_static +from paddle.static import InputSpec + +from .base_model import BaseModel +from .distillation_model import DistillationModel + +__all__ = ["build_model", "apply_to_static"] + + +def build_model(config): + config = copy.deepcopy(config) + if not "name" in config: + arch = BaseModel(config) + else: + name = config.pop("name") + mod = importlib.import_module(__name__) + arch = getattr(mod, name)(config) + return arch + + +def apply_to_static(model, config, logger): + if config["Global"].get("to_static", False) is not True: + return model + assert "image_shape" in config[ + "Global"], "image_shape must be assigned for static training mode..." 
+ supported_list = ["DB", "SVTR"] + if config["Architecture"]["algorithm"] in ["Distillation"]: + algo = list(config["Architecture"]["Models"].values())[0]["algorithm"] + else: + algo = config["Architecture"]["algorithm"] + assert algo in supported_list, f"algorithms that supports static training must in in {supported_list} but got {algo}" + + specs = [ + InputSpec( + [None] + config["Global"]["image_shape"], dtype='float32') + ] + + if algo == "SVTR": + specs.append([ + InputSpec( + [None, config["Global"]["max_text_length"]], + dtype='int64'), InputSpec( + [None, config["Global"]["max_text_length"]], dtype='int64'), + InputSpec( + [None], dtype='int64'), InputSpec( + [None], dtype='float64') + ]) + + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) + return model diff --git a/ppocr/modeling/architectures/base_model.py b/ppocr/modeling/architectures/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5612d366ea9ccf3f45ab675fbaa374fd4fe5d773 --- /dev/null +++ b/ppocr/modeling/architectures/base_model.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from ppocr.modeling.transforms import build_transform +from ppocr.modeling.backbones import build_backbone +from ppocr.modeling.necks import build_neck +from ppocr.modeling.heads import build_head + +__all__ = ['BaseModel'] + + +class BaseModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR. + args: + config (dict): the super parameters for module. + """ + super(BaseModel, self).__init__() + in_channels = config.get('in_channels', 3) + model_type = config['model_type'] + # build transfrom, + # for rec, transfrom can be TPS,None + # for det and cls, transfrom shoule to be None, + # if you make model differently, you can use transfrom in det and cls + if 'Transform' not in config or config['Transform'] is None: + self.use_transform = False + else: + self.use_transform = True + config['Transform']['in_channels'] = in_channels + self.transform = build_transform(config['Transform']) + in_channels = self.transform.out_channels + + # build backbone, backbone is need for del, rec and cls + if 'Backbone' not in config or config['Backbone'] is None: + self.use_backbone = False + else: + self.use_backbone = True + config["Backbone"]['in_channels'] = in_channels + self.backbone = build_backbone(config["Backbone"], model_type) + in_channels = self.backbone.out_channels + + # build neck + # for rec, neck can be cnn,rnn or reshape(None) + # for det, neck can be FPN, BIFPN and so on. 
+ # for cls, neck should be none + if 'Neck' not in config or config['Neck'] is None: + self.use_neck = False + else: + self.use_neck = True + config['Neck']['in_channels'] = in_channels + self.neck = build_neck(config['Neck']) + in_channels = self.neck.out_channels + + # # build head, head is need for det, rec and cls + if 'Head' not in config or config['Head'] is None: + self.use_head = False + else: + self.use_head = True + config["Head"]['in_channels'] = in_channels + self.head = build_head(config["Head"]) + + self.return_all_feats = config.get("return_all_feats", False) + + def forward(self, x, data=None): + + y = dict() + if self.use_transform: + x = self.transform(x) + if self.use_backbone: + x = self.backbone(x) + if isinstance(x, dict): + y.update(x) + else: + y["backbone_out"] = x + final_name = "backbone_out" + if self.use_neck: + x = self.neck(x) + if isinstance(x, dict): + y.update(x) + else: + y["neck_out"] = x + final_name = "neck_out" + if self.use_head: + x = self.head(x, targets=data) + # for multi head, save ctc neck out for udml + if isinstance(x, dict) and 'ctc_neck' in x.keys(): + y["neck_out"] = x["ctc_neck"] + y["head_out"] = x + elif isinstance(x, dict): + y.update(x) + else: + y["head_out"] = x + final_name = "head_out" + if self.return_all_feats: + if self.training: + return y + elif isinstance(x, dict): + return x + else: + return {final_name: x} + else: + return x \ No newline at end of file diff --git a/ppocr/modeling/architectures/distillation_model.py b/ppocr/modeling/architectures/distillation_model.py new file mode 100644 index 0000000000000000000000000000000000000000..cce8fd311d4e847afda0fbb035743f0a10564c7d --- /dev/null +++ b/ppocr/modeling/architectures/distillation_model.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from ppocr.modeling.transforms import build_transform +from ppocr.modeling.backbones import build_backbone +from ppocr.modeling.necks import build_neck +from ppocr.modeling.heads import build_head +from .base_model import BaseModel +from ppocr.utils.save_load import load_pretrained_params + +__all__ = ['DistillationModel'] + + +class DistillationModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR distillation. + args: + config (dict): the super parameters for module. 
+ """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + for key in config["Models"]: + model_config = config["Models"][key] + freeze_params = False + pretrained = None + if "freeze_params" in model_config: + freeze_params = model_config.pop("freeze_params") + if "pretrained" in model_config: + pretrained = model_config.pop("pretrained") + model = BaseModel(model_config) + if pretrained is not None: + load_pretrained_params(model, pretrained) + if freeze_params: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + def forward(self, x, data=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + result_dict[model_name] = self.model_list[idx](x, data) + return result_dict diff --git a/ppocr/modeling/backbones/__init__.py b/ppocr/modeling/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c2e9c4a4ed526b36d512d824ae8a8a701c17bc --- /dev/null +++ b/ppocr/modeling/backbones/__init__.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["build_backbone"] + + +def build_backbone(config, model_type): + if model_type == "det" or model_type == "table": + from .det_mobilenet_v3 import MobileNetV3 + from .det_resnet import ResNet + from .det_resnet_vd import ResNet_vd + from .det_resnet_vd_sast import ResNet_SAST + from .det_pp_lcnet import PPLCNet + support_dict = [ + "MobileNetV3", "ResNet", "ResNet_vd", "ResNet_SAST", "PPLCNet" + ] + if model_type == "table": + from .table_master_resnet import TableResNetExtra + support_dict.append('TableResNetExtra') + elif model_type == "rec" or model_type == "cls": + from .rec_mobilenet_v3 import MobileNetV3 + from .rec_resnet_vd import ResNet + from .rec_resnet_fpn import ResNetFPN + from .rec_mv1_enhance import MobileNetV1Enhance + from .rec_nrtr_mtb import MTB + from .rec_resnet_31 import ResNet31 + from .rec_resnet_32 import ResNet32 + from .rec_resnet_45 import ResNet45 + from .rec_resnet_aster import ResNet_ASTER + from .rec_micronet import MicroNet + from .rec_efficientb3_pren import EfficientNetb3_PREN + from .rec_svtrnet import SVTRNet + from .rec_vitstr import ViTSTR + from .rec_resnet_rfl import ResNetRFL + from .rec_densenet import DenseNet + support_dict = [ + 'MobileNetV1Enhance', 'MobileNetV3', 'ResNet', 'ResNetFPN', 'MTB', + 'ResNet31', 'ResNet45', 'ResNet_ASTER', 'MicroNet', + 'EfficientNetb3_PREN', 'SVTRNet', 'ViTSTR', 'ResNet32', 'ResNetRFL', + 'DenseNet' + ] + elif model_type == 'e2e': + from .e2e_resnet_vd_pg import ResNet + support_dict = ['ResNet'] + elif model_type == 'kie': + from .kie_unet_sdmgr import Kie_backbone + from .vqa_layoutlm import LayoutLMForSer, LayoutLMv2ForSer, LayoutLMv2ForRe, LayoutXLMForSer, LayoutXLMForRe + support_dict = [ + 'Kie_backbone', 'LayoutLMForSer', 'LayoutLMv2ForSer', + 'LayoutLMv2ForRe', 'LayoutXLMForSer', 
'LayoutXLMForRe' + ] + elif model_type == 'table': + from .table_resnet_vd import ResNet + from .table_mobilenet_v3 import MobileNetV3 + support_dict = ['ResNet', 'MobileNetV3'] + else: + raise NotImplementedError + + module_name = config.pop('name') + assert module_name in support_dict, Exception( + "when model typs is {}, backbone only support {}".format(model_type, + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/modeling/backbones/det_mobilenet_v3.py b/ppocr/modeling/backbones/det_mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..05113ea8419aa302c952adfd74e9083055c35dca --- /dev/null +++ b/ppocr/modeling/backbones/det_mobilenet_v3.py @@ -0,0 +1,268 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + +__all__ = ['MobileNetV3'] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MobileNetV3(nn.Layer): + def __init__(self, + in_channels=3, + model_name='large', + scale=0.5, + disable_se=False, + **kwargs): + """ + the MobilenetV3 backbone network for detection module. 
+ Args: + params(dict): the super parameters for build network + """ + super(MobileNetV3, self).__init__() + + self.disable_se = disable_se + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', 1], + [3, 64, 24, False, 'relu', 2], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', 2], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hardswish', 2], + [3, 200, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 480, 112, True, 'hardswish', 1], + [3, 672, 112, True, 'hardswish', 1], + [5, 672, 160, True, 'hardswish', 2], + [5, 960, 160, True, 'hardswish', 1], + [5, 960, 160, True, 'hardswish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', 2], + [3, 72, 24, False, 'relu', 2], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hardswish', 2], + [5, 240, 40, True, 'hardswish', 1], + [5, 240, 40, True, 'hardswish', 1], + [5, 120, 48, True, 'hardswish', 1], + [5, 144, 48, True, 'hardswish', 1], + [5, 288, 96, True, 'hardswish', 2], + [5, 576, 96, True, 'hardswish', 1], + [5, 576, 96, True, 'hardswish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scale are {} but input scale is {}".format(supported_scale, scale) + inplanes = 16 + # conv1 + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hardswish') + + self.stages = [] + self.out_channels = [] + block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + start_idx = 2 if model_name == 'large' else 0 + if s == 2 and i > start_idx: + self.out_channels.append(inplanes) + self.stages.append(nn.Sequential(*block_list)) + block_list = [] + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl)) + inplanes = make_divisible(scale * c) + i += 1 + block_list.append( + ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hardswish')) + self.stages.append(nn.Sequential(*block_list)) + self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) + for i, stage in enumerate(self.stages): + self.add_sublayer(sublayer=stage, name="stage{}".format(i)) + + def forward(self, x): + x = self.conv(x) + out_list = [] + for stage in self.stages: + x = stage(x) + out_list.append(x) + return out_list + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn = nn.BatchNorm(num_channels=out_channels, act=None) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif 
self.act == "hardswish": + x = F.hardswish(x) + else: + print("The activation function({}) is selected incorrectly.". + format(self.act)) + exit() + return x + + +class ResidualUnit(nn.Layer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size, + stride, + use_se, + act=None): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_channels == out_channels + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + groups=mid_channels, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_channels) + self.linear_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(inputs, x) + return x + + +class SEModule(nn.Layer): + def __init__(self, in_channels, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0) + self.conv2 = nn.Conv2D( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return inputs * outputs diff --git a/ppocr/modeling/backbones/det_pp_lcnet.py b/ppocr/modeling/backbones/det_pp_lcnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3f719e92bc67452b482e5b2053ee1a09540ffc0e --- /dev/null +++ b/ppocr/modeling/backbones/det_pp_lcnet.py @@ -0,0 +1,271 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
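+# PP-LCNet backbone for text detection. forward() returns the feature maps produced
+# after blocks3/4/5/6 (strides 4, 8, 16 and 32), whose channel widths are listed in
+# self.out_channels and controlled by the `scale` argument.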
+ +from __future__ import absolute_import, division, print_function + +import os +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from paddle.utils.download import get_path_from_url + +MODEL_URLS = { + "PPLCNet_x0.25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0.35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0.75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1.0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2.0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2.5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. +# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": + [[3, 128, 256, 2, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.hardswish(x) + return x + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + dw_size=3, + use_se=False): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels) + if use_se: 
+ self.se = SEModule(num_channels) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(nn.Layer): + def __init__(self, + in_channels=3, + scale=1.0, + pretrained=False, + use_ssld=False): + super().__init__() + self.out_channels = [ + int(NET_CONFIG["blocks3"][-1][2] * scale), + int(NET_CONFIG["blocks4"][-1][2] * scale), + int(NET_CONFIG["blocks5"][-1][2] * scale), + int(NET_CONFIG["blocks6"][-1][2] * scale) + ] + self.scale = scale + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=2) + + self.blocks2 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) + ]) + + if pretrained: + self._load_pretrained( + MODEL_URLS['PPLCNet_x{}'.format(scale)], use_ssld=use_ssld) + + def forward(self, x): + outs = [] + x = self.conv1(x) + x = self.blocks2(x) + x = self.blocks3(x) + outs.append(x) + x = self.blocks4(x) + outs.append(x) + x = self.blocks5(x) + outs.append(x) + x = self.blocks6(x) + outs.append(x) + return outs + + def _load_pretrained(self, pretrained_url, use_ssld=False): + if use_ssld: + pretrained_url = pretrained_url.replace("_pretrained", + "_ssld_pretrained") + print(pretrained_url) + local_weight_path = get_path_from_url( + pretrained_url, os.path.expanduser("~/.paddleclas/weights")) + param_state_dict = paddle.load(local_weight_path) + self.set_dict(param_state_dict) + return diff --git a/ppocr/modeling/backbones/det_resnet.py b/ppocr/modeling/backbones/det_resnet.py new file 
mode 100644 index 0000000000000000000000000000000000000000..87eef11cf0e33c24c0f539c8074b21f589345282 --- /dev/null +++ b/ppocr/modeling/backbones/det_resnet.py @@ -0,0 +1,236 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from paddle.vision.ops import DeformConv2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant, XavierUniform +from .det_resnet_vd import DeformableConvV2, ConvBNLayer + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + is_dcn=False): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + act="relu", ) + self.conv1 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + act="relu", + is_dcn=is_dcn, + dcn_groups=1, ) + self.conv2 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + act=None, ) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + stride=stride, ) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + act="relu") + self.conv1 = ConvBNLayer( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + stride=stride) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, + in_channels=3, + layers=50, + out_indices=None, + dcn_stage=None): + super(ResNet, self).__init__() + + self.layers = layers + self.input_image_channel = in_channels + + supported_layers = [18, 34, 50, 101, 152] + 
assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.dcn_stage = dcn_stage if dcn_stage is not None else [ + False, False, False, False + ] + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + self.conv = ConvBNLayer( + in_channels=self.input_image_channel, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", ) + self.pool2d_max = MaxPool2D( + kernel_size=3, + stride=2, + padding=1, ) + + self.stages = [] + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + block_list = [] + is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + is_dcn=is_dcn)) + block_list.append(bottleneck_block) + shortcut = True + if block in self.out_indices: + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + shortcut = False + block_list = [] + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut)) + block_list.append(basic_block) + shortcut = True + if block in self.out_indices: + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + out = [] + for i, block in enumerate(self.stages): + y = block(y) + if i in self.out_indices: + out.append(y) + return out diff --git a/ppocr/modeling/backbones/det_resnet_vd.py b/ppocr/modeling/backbones/det_resnet_vd.py new file mode 100644 index 0000000000000000000000000000000000000000..a421da0ab440e9b87c1c7efc7d2448f8f76ad205 --- /dev/null +++ b/ppocr/modeling/backbones/det_resnet_vd.py @@ -0,0 +1,352 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
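+# ResNet-vd backbone for text detection: a stack of three 3x3 convs forms the stem,
+# downsampling shortcuts go through a 2x2 average pool (the "vd" trick), and
+# individual stages can switch to deformable convolutions via dcn_stage.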
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.vision.ops import DeformConv2D +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Normal, Constant, XavierUniform + +__all__ = ["ResNet_vd", "ConvBNLayer", "DeformableConvV2"] + + +class DeformableConvV2(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + lr_scale=1, + regularizer=None, + skip_quant=False, + dcn_bias_regularizer=L2Decay(0.), + dcn_bias_lr_scale=2.): + super(DeformableConvV2, self).__init__() + self.offset_channel = 2 * kernel_size**2 * groups + self.mask_channel = kernel_size**2 * groups + + if bias_attr: + # in FCOS-DCN head, specifically need learning_rate and regularizer + dcn_bias_attr = ParamAttr( + initializer=Constant(value=0), + regularizer=dcn_bias_regularizer, + learning_rate=dcn_bias_lr_scale) + else: + # in ResNet backbone, do not need bias + dcn_bias_attr = False + self.conv_dcn = DeformConv2D( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 * dilation, + dilation=dilation, + deformable_groups=groups, + weight_attr=weight_attr, + bias_attr=dcn_bias_attr) + + if lr_scale == 1 and regularizer is None: + offset_bias_attr = ParamAttr(initializer=Constant(0.)) + else: + offset_bias_attr = ParamAttr( + initializer=Constant(0.), + learning_rate=lr_scale, + regularizer=regularizer) + self.conv_offset = nn.Conv2D( + in_channels, + groups * 3 * kernel_size**2, + kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + weight_attr=ParamAttr(initializer=Constant(0.0)), + bias_attr=offset_bias_attr) + if skip_quant: + self.conv_offset.skip_quant = True + + def forward(self, x): + offset_mask = self.conv_offset(x) + offset, mask = paddle.split( + offset_mask, + num_or_sections=[self.offset_channel, self.mask_channel], + axis=1) + mask = F.sigmoid(mask) + y = self.conv_dcn(x, offset, mask=mask) + return y + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + dcn_groups=1, + is_vd_mode=False, + act=None, + is_dcn=False): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + if not is_dcn: + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + else: + self._conv = DeformableConvV2( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=dcn_groups, #groups, + bias_attr=False) + self._batch_norm = nn.BatchNorm(out_channels, act=act) + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + is_dcn=False, ): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu') + self.conv1 = ConvBNLayer( + in_channels=out_channels, + 
out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + is_dcn=is_dcn, + dcn_groups=2) + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, ): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu') + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_vd(nn.Layer): + def __init__(self, + in_channels=3, + layers=50, + dcn_stage=None, + out_indices=None, + **kwargs): + super(ResNet_vd, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.dcn_stage = dcn_stage if dcn_stage is not None else [ + False, False, False, False + ] + self.out_indices = out_indices if out_indices is not None else [ + 0, 1, 2, 3 + ] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu') + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu') + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu') + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + is_dcn = self.dcn_stage[block] + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + is_dcn=is_dcn)) + shortcut = True + block_list.append(bottleneck_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block] * 4) + 
self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0)) + shortcut = True + block_list.append(basic_block) + if block in self.out_indices: + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + out = [] + for i, block in enumerate(self.stages): + y = block(y) + if i in self.out_indices: + out.append(y) + return out diff --git a/ppocr/modeling/backbones/det_resnet_vd_sast.py b/ppocr/modeling/backbones/det_resnet_vd_sast.py new file mode 100644 index 0000000000000000000000000000000000000000..c9376a8d56e9fa8eff5d2980f63b0b8b955efe88 --- /dev/null +++ b/ppocr/modeling/backbones/det_resnet_vd_sast.py @@ -0,0 +1,285 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
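+# SAST-specific ResNet-vd variant: forward() also returns the raw input and the stem
+# feature map, and the 34/50-layer configurations use a fifth residual stage, so
+# out_channels starts with [3, 64] before the per-stage channel counts.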
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet_SAST"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_SAST(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet_SAST, self).__init__() + + self.layers = layers + supported_layers = 
[18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + # num_channels = [64, 256, 512, + # 1024] if layers >= 50 else [64, 64, 128, 256] + # num_filters = [64, 128, 256, 512] + num_channels = [64, 256, 512, + 1024, 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [3, 64] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(basic_block) + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out \ No newline at end of file diff --git a/ppocr/modeling/backbones/e2e_resnet_vd_pg.py b/ppocr/modeling/backbones/e2e_resnet_vd_pg.py new file mode 100644 index 0000000000000000000000000000000000000000..97afd3460d03dc078b53064fb45b6fb6d3542df9 --- /dev/null +++ b/ppocr/modeling/backbones/e2e_resnet_vd_pg.py @@ -0,0 +1,265 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = 
self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + # depth = [3, 4, 6, 3] + depth = [3, 4, 6, 3, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024, + 2048] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act='relu', + name="conv1_1") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.stages = [] + self.out_channels = [3, 64] + # num_filters = [64, 128, 256, 512, 512] + if layers >= 50: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(bottleneck_block) + self.out_channels.append(num_filters[block] * 4) + self.stages.append(nn.Sequential(*block_list)) + else: + for block in range(len(depth)): + block_list = [] + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + block_list.append(basic_block) + self.out_channels.append(num_filters[block]) + self.stages.append(nn.Sequential(*block_list)) + + def forward(self, inputs): + out = [inputs] + y = self.conv1_1(inputs) + out.append(y) + y = self.pool2d_max(y) + for block in self.stages: + y = block(y) + out.append(y) + return out diff --git a/ppocr/modeling/backbones/kie_unet_sdmgr.py b/ppocr/modeling/backbones/kie_unet_sdmgr.py new file mode 100644 index 0000000000000000000000000000000000000000..4b1bd8030060b26acb9e60bd671a5b23d936347b --- /dev/null +++ b/ppocr/modeling/backbones/kie_unet_sdmgr.py @@ -0,0 +1,181 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import numpy as np +import cv2 + +__all__ = ["Kie_backbone"] + + +class Encoder(nn.Layer): + def __init__(self, num_channels, num_filters): + super(Encoder, self).__init__() + self.conv1 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(num_filters, act='relu') + + self.conv2 = nn.Conv2D( + num_filters, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(num_filters, act='relu') + + self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.bn1(x) + x = self.conv2(x) + x = self.bn2(x) + x_pooled = self.pool(x) + return x, x_pooled + + +class Decoder(nn.Layer): + def __init__(self, num_channels, num_filters): + super(Decoder, self).__init__() + + self.conv1 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(num_filters, act='relu') + + self.conv2 = nn.Conv2D( + num_filters, + num_filters, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(num_filters, act='relu') + + self.conv0 = nn.Conv2D( + num_channels, + num_filters, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.bn0 = nn.BatchNorm(num_filters, act='relu') + + def forward(self, inputs_prev, inputs): + x = self.conv0(inputs) + x = self.bn0(x) + x = paddle.nn.functional.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=False) + x = paddle.concat([inputs_prev, x], axis=1) + x = self.conv1(x) + x = self.bn1(x) + x = self.conv2(x) + x = self.bn2(x) + return x + + +class UNet(nn.Layer): + def __init__(self): + super(UNet, self).__init__() + self.down1 = Encoder(num_channels=3, num_filters=16) + self.down2 = Encoder(num_channels=16, num_filters=32) + self.down3 = Encoder(num_channels=32, num_filters=64) + self.down4 = Encoder(num_channels=64, num_filters=128) + self.down5 = Encoder(num_channels=128, num_filters=256) + + self.up1 = Decoder(32, 16) + self.up2 = Decoder(64, 32) + self.up3 = Decoder(128, 64) + self.up4 = Decoder(256, 128) + self.out_channels = 16 + + def forward(self, inputs): + x1, _ = self.down1(inputs) + _, x2 = self.down2(x1) + _, x3 = self.down3(x2) + _, x4 = self.down4(x3) + _, x5 = self.down5(x4) + + x = self.up4(x4, x5) + x = self.up3(x3, x) + x = self.up2(x2, x) + x = self.up1(x1, x) + return x + + +class Kie_backbone(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(Kie_backbone, self).__init__() + self.out_channels = 16 + self.img_feat = UNet() + self.maxpool = nn.MaxPool2D(kernel_size=7) + + def bbox2roi(self, bbox_list): + rois_list = [] + rois_num = [] + for img_id, bboxes in enumerate(bbox_list): + rois_num.append(bboxes.shape[0]) + rois_list.append(bboxes) + rois = paddle.concat(rois_list, 0) + rois_num = paddle.to_tensor(rois_num, dtype='int32') + return rois, rois_num + + def pre_process(self, 
img, relations, texts, gt_bboxes, tag, img_size): + img, relations, texts, gt_bboxes, tag, img_size = img.numpy( + ), relations.numpy(), texts.numpy(), gt_bboxes.numpy(), tag.numpy( + ).tolist(), img_size.numpy() + temp_relations, temp_texts, temp_gt_bboxes = [], [], [] + h, w = int(np.max(img_size[:, 0])), int(np.max(img_size[:, 1])) + img = paddle.to_tensor(img[:, :, :h, :w]) + batch = len(tag) + for i in range(batch): + num, recoder_len = tag[i][0], tag[i][1] + temp_relations.append( + paddle.to_tensor( + relations[i, :num, :num, :], dtype='float32')) + temp_texts.append( + paddle.to_tensor( + texts[i, :num, :recoder_len], dtype='float32')) + temp_gt_bboxes.append( + paddle.to_tensor( + gt_bboxes[i, :num, ...], dtype='float32')) + return img, temp_relations, temp_texts, temp_gt_bboxes + + def forward(self, inputs): + img = inputs[0] + relations, texts, gt_bboxes, tag, img_size = inputs[1], inputs[ + 2], inputs[3], inputs[5], inputs[-1] + img, relations, texts, gt_bboxes = self.pre_process( + img, relations, texts, gt_bboxes, tag, img_size) + x = self.img_feat(img) + boxes, rois_num = self.bbox2roi(gt_bboxes) + feats = paddle.vision.ops.roi_align( + x, boxes, spatial_scale=1.0, output_size=7, boxes_num=rois_num) + feats = self.maxpool(feats).squeeze(-1).squeeze(-1) + return [relations, texts, feats] diff --git a/ppocr/modeling/backbones/rec_densenet.py b/ppocr/modeling/backbones/rec_densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..dd84a0f2c68dc17f4f21eb4ceaa851be08cfb9af --- /dev/null +++ b/ppocr/modeling/backbones/rec_densenet.py @@ -0,0 +1,146 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
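A quick shape-check sketch for the U-Net feature extractor used by Kie_backbone above (illustrative only; the 512x512 input is an assumption). Each Encoder stage halves the resolution through its pooled output and each Decoder stage doubles it back, so the output keeps the input resolution with 16 channels (UNet.out_channels):

import paddle
from ppocr.modeling.backbones.kie_unet_sdmgr import UNet

net = UNet()
net.eval()
x = paddle.randn([1, 3, 512, 512])
y = net(x)
print(y.shape)  # [1, 16, 512, 512]: same spatial size, 16 feature channels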
+""" +This code is refer from: +https://github.com/LBH1024/CAN/models/densenet.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class Bottleneck(nn.Layer): + def __init__(self, nChannels, growthRate, use_dropout): + super(Bottleneck, self).__init__() + interChannels = 4 * growthRate + self.bn1 = nn.BatchNorm2D(interChannels) + self.conv1 = nn.Conv2D( + nChannels, interChannels, kernel_size=1, + bias_attr=None) # Xavier initialization + self.bn2 = nn.BatchNorm2D(growthRate) + self.conv2 = nn.Conv2D( + interChannels, growthRate, kernel_size=3, padding=1, + bias_attr=None) # Xavier initialization + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out = self.dropout(out) + out = F.relu(self.bn2(self.conv2(out))) + if self.use_dropout: + out = self.dropout(out) + out = paddle.concat([x, out], 1) + return out + + +class SingleLayer(nn.Layer): + def __init__(self, nChannels, growthRate, use_dropout): + super(SingleLayer, self).__init__() + self.bn1 = nn.BatchNorm2D(nChannels) + self.conv1 = nn.Conv2D( + nChannels, growthRate, kernel_size=3, padding=1, bias_attr=False) + + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = self.conv1(F.relu(x)) + if self.use_dropout: + out = self.dropout(out) + + out = paddle.concat([x, out], 1) + return out + + +class Transition(nn.Layer): + def __init__(self, nChannels, out_channels, use_dropout): + super(Transition, self).__init__() + self.bn1 = nn.BatchNorm2D(out_channels) + self.conv1 = nn.Conv2D( + nChannels, out_channels, kernel_size=1, bias_attr=False) + self.use_dropout = use_dropout + self.dropout = nn.Dropout(p=0.2) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + if self.use_dropout: + out = self.dropout(out) + out = F.avg_pool2d(out, 2, ceil_mode=True, exclusive=False) + return out + + +class DenseNet(nn.Layer): + def __init__(self, growthRate, reduction, bottleneck, use_dropout, + input_channel, **kwargs): + super(DenseNet, self).__init__() + + nDenseBlocks = 16 + nChannels = 2 * growthRate + + self.conv1 = nn.Conv2D( + input_channel, + nChannels, + kernel_size=7, + padding=3, + stride=2, + bias_attr=False) + self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans1 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + nChannels += nDenseBlocks * growthRate + out_channels = int(math.floor(nChannels * reduction)) + self.trans2 = Transition(nChannels, out_channels, use_dropout) + + nChannels = out_channels + self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, + bottleneck, use_dropout) + self.out_channels = out_channels + + def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck, + use_dropout): + layers = [] + for i in range(int(nDenseBlocks)): + if bottleneck: + layers.append(Bottleneck(nChannels, growthRate, use_dropout)) + else: + layers.append(SingleLayer(nChannels, growthRate, use_dropout)) + nChannels += growthRate + return nn.Sequential(*layers) + + def forward(self, inputs): + x, x_m, y = inputs + out = 
self.conv1(x) + out = F.relu(out) + out = F.max_pool2d(out, 2, ceil_mode=True) + out = self.dense1(out) + out = self.trans1(out) + out = self.dense2(out) + out = self.trans2(out) + out = self.dense3(out) + return out, x_m, y diff --git a/ppocr/modeling/backbones/rec_efficientb3_pren.py b/ppocr/modeling/backbones/rec_efficientb3_pren.py new file mode 100644 index 0000000000000000000000000000000000000000..701e436c1e0e29f42cc9c7ce6e66552d4005f6b0 --- /dev/null +++ b/ppocr/modeling/backbones/rec_efficientb3_pren.py @@ -0,0 +1,279 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Code is refer from: +https://github.com/RuijieJ/pren/blob/main/Nets/EfficientNet.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import re +import collections +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ['EfficientNetb3'] + +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes', + 'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth', + 'drop_connect_rate', 'image_size' +]) + +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + + +class BlockDecoder: + @staticmethod + def _decode_block_string(block_string): + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + assert (('s' in options and len(options['s']) == 1) or + (len(options['s']) == 2 and options['s'][0] == options['s'][1])) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def decode(string_list): + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2, + image_size=None, + num_classes=1000): + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + 
num_classes=num_classes, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None, + image_size=image_size, ) + return blocks_args, global_params + + +class EffUtils: + @staticmethod + def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. """ + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: + new_filters += divisor + return int(new_filters) + + @staticmethod + def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +class MbConvBlock(nn.Layer): + def __init__(self, block_args): + super(MbConvBlock, self).__init__() + self._block_args = block_args + self.has_se = (self._block_args.se_ratio is not None) and \ + (0 < self._block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip + + # expansion phase + self.inp = self._block_args.input_filters + oup = self._block_args.input_filters * self._block_args.expand_ratio + if self._block_args.expand_ratio != 1: + self._expand_conv = nn.Conv2D(self.inp, oup, 1, bias_attr=False) + self._bn0 = nn.BatchNorm(oup) + + # depthwise conv phase + k = self._block_args.kernel_size + s = self._block_args.stride + if isinstance(s, list): + s = s[0] + self._depthwise_conv = nn.Conv2D( + oup, + oup, + groups=oup, + kernel_size=k, + stride=s, + padding='same', + bias_attr=False) + self._bn1 = nn.BatchNorm(oup) + + # squeeze and excitation layer, if desired + if self.has_se: + num_squeezed_channels = max(1, + int(self._block_args.input_filters * + self._block_args.se_ratio)) + self._se_reduce = nn.Conv2D(oup, num_squeezed_channels, 1) + self._se_expand = nn.Conv2D(num_squeezed_channels, oup, 1) + + # output phase and some util class + self.final_oup = self._block_args.output_filters + self._project_conv = nn.Conv2D(oup, self.final_oup, 1, bias_attr=False) + self._bn2 = nn.BatchNorm(self.final_oup) + self._swish = nn.Swish() + + def _drop_connect(self, inputs, p, training): + if not training: + return inputs + batch_size = inputs.shape[0] + keep_prob = 1 - p + random_tensor = keep_prob + random_tensor += paddle.rand([batch_size, 1, 1, 1], dtype=inputs.dtype) + random_tensor = paddle.to_tensor(random_tensor, place=inputs.place) + binary_tensor = paddle.floor(random_tensor) + output = inputs / keep_prob * binary_tensor + return output + + def forward(self, inputs, drop_connect_rate=None): + # expansion and depthwise conv + x = inputs + if self._block_args.expand_ratio != 1: + x = self._swish(self._bn0(self._expand_conv(inputs))) + x = self._swish(self._bn1(self._depthwise_conv(x))) + + # squeeze and excitation + if self.has_se: + x_squeezed = F.adaptive_avg_pool2d(x, 1) + x_squeezed = self._se_expand( + self._swish(self._se_reduce(x_squeezed))) + x = F.sigmoid(x_squeezed) * x + x = self._bn2(self._project_conv(x)) + + # skip conntection and drop connect + if self.id_skip and self._block_args.stride == 1 and \ + self.inp == self.final_oup: + if drop_connect_rate: + x = self._drop_connect( + x, p=drop_connect_rate, training=self.training) + x = x + inputs + return x + + +class 
EfficientNetb3_PREN(nn.Layer): + def __init__(self, in_channels): + super(EfficientNetb3_PREN, self).__init__() + """ + the fllowing are efficientnetb3's superparams, + they means efficientnetb3 network's width, depth, resolution and + dropout respectively, to fit for text recognition task, the resolution + here is changed from 300 to 64. + """ + w, d, s, p = 1.2, 1.4, 64, 0.3 + self._blocks_args, self._global_params = efficientnet( + width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) + self.out_channels = [] + # stem + out_channels = EffUtils.round_filters(32, self._global_params) + self._conv_stem = nn.Conv2D( + in_channels, out_channels, 3, 2, padding='same', bias_attr=False) + self._bn0 = nn.BatchNorm(out_channels) + + # build blocks + self._blocks = [] + # to extract three feature maps for fpn based on efficientnetb3 backbone + self._concerned_block_idxes = [7, 17, 25] + _concerned_idx = 0 + for i, block_args in enumerate(self._blocks_args): + block_args = block_args._replace( + input_filters=EffUtils.round_filters(block_args.input_filters, + self._global_params), + output_filters=EffUtils.round_filters(block_args.output_filters, + self._global_params), + num_repeat=EffUtils.round_repeats(block_args.num_repeat, + self._global_params)) + self._blocks.append( + self.add_sublayer(f"{i}-0", MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for j in range(block_args.num_repeat - 1): + self._blocks.append( + self.add_sublayer(f'{i}-{j+1}', MbConvBlock(block_args))) + _concerned_idx += 1 + if _concerned_idx in self._concerned_block_idxes: + self.out_channels.append(block_args.output_filters) + + self._swish = nn.Swish() + + def forward(self, inputs): + outs = [] + x = self._swish(self._bn0(self._conv_stem(inputs))) + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + if idx in self._concerned_block_idxes: + outs.append(x) + return outs diff --git a/ppocr/modeling/backbones/rec_micronet.py b/ppocr/modeling/backbones/rec_micronet.py new file mode 100644 index 0000000000000000000000000000000000000000..b0ae5a14c3004f63d39dff32cc737a0d96155593 --- /dev/null +++ b/ppocr/modeling/backbones/rec_micronet.py @@ -0,0 +1,528 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
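A usage sketch for the EfficientNetb3_PREN backbone defined above (illustrative; the 64x256 input size is an assumption, chosen because the class docstring notes the working resolution is reduced to 64 for text recognition). The forward pass returns the three intermediate feature maps whose indices are listed in _concerned_block_idxes, which the PREN neck then aggregates:

import paddle
from ppocr.modeling.backbones.rec_efficientb3_pren import EfficientNetb3_PREN

net = EfficientNetb3_PREN(in_channels=3)
net.eval()
x = paddle.randn([1, 3, 64, 256])
feats = net(x)
print(net.out_channels)          # channel counts of the three returned maps
print([f.shape for f in feats])  # three feature maps at decreasing resolution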
+""" +This code is refer from: +https://github.com/liyunsheng13/micronet/blob/main/backbone/micronet.py +https://github.com/liyunsheng13/micronet/blob/main/backbone/activation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from ppocr.modeling.backbones.det_mobilenet_v3 import make_divisible + +M0_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r + [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, 1], + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, 1], + [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, 1], + [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, 1], + [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, 2], +] +M1_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, 1], + [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, 1], + [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, 1], + [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 1], + [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, 2], +] +M2_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, 1], + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, 1], + [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, 1], + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, 1], + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, 2], + [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, 2], + [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, 2], + [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, 2], + [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, 2], +] +M3_cfgs = [ + # s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, 1], + [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, 1], + [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, 1], + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, 1], + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, 2], + [1, 1, 64, 5, 1, 6, 8, 8, 48, 8, 8, 0, 2, 0, 2], + [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, 2], + [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, 2], + [1, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, 2], + [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, 2], + [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, 2], + [1, 1, 432, 3, 1, 3, 12, 12, 0, 0, 0, 0, 2, 0, 2], +] + + +def get_micronet_config(mode): + return eval(mode + '_cfgs') + + +class MaxGroupPooling(nn.Layer): + def __init__(self, channel_per_group=2): + super(MaxGroupPooling, self).__init__() + self.channel_per_group = channel_per_group + + def forward(self, x): + if self.channel_per_group == 1: + return x + # max op + b, c, h, w = x.shape + + # reshape + y = paddle.reshape(x, [b, c // self.channel_per_group, -1, h, w]) + out = paddle.max(y, axis=2) + return out + + +class SpatialSepConvSF(nn.Layer): + def __init__(self, inp, oups, kernel_size, stride): + super(SpatialSepConvSF, self).__init__() + + oup1, oup2 = oups + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0), + bias_attr=False, + groups=1), + nn.BatchNorm2D(oup1), + nn.Conv2D( + oup1, + oup1 * oup2, (1, kernel_size), (1, stride), + (0, kernel_size // 2), + bias_attr=False, + groups=oup1), + nn.BatchNorm2D(oup1 * oup2), + ChannelShuffle(oup1), ) + + def forward(self, x): + out = self.conv(x) + return out + + +class 
ChannelShuffle(nn.Layer): + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + b, c, h, w = x.shape + + channels_per_group = c // self.groups + + # reshape + x = paddle.reshape(x, [b, self.groups, channels_per_group, h, w]) + + x = paddle.transpose(x, (0, 2, 1, 3, 4)) + out = paddle.reshape(x, [b, -1, h, w]) + + return out + + +class StemLayer(nn.Layer): + def __init__(self, inp, oup, stride, groups=(4, 4)): + super(StemLayer, self).__init__() + + g1, g2 = groups + self.stem = nn.Sequential( + SpatialSepConvSF(inp, groups, 3, stride), + MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6()) + + def forward(self, x): + out = self.stem(x) + return out + + +class DepthSpatialSepConv(nn.Layer): + def __init__(self, inp, expand, kernel_size, stride): + super(DepthSpatialSepConv, self).__init__() + + exp1, exp2 = expand + + hidden_dim = inp * exp1 + oup = inp * exp1 * exp2 + + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp * exp1, (kernel_size, 1), (stride, 1), + (kernel_size // 2, 0), + bias_attr=False, + groups=inp), + nn.BatchNorm2D(inp * exp1), + nn.Conv2D( + hidden_dim, + oup, (1, kernel_size), + 1, (0, kernel_size // 2), + bias_attr=False, + groups=hidden_dim), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class GroupConv(nn.Layer): + def __init__(self, inp, oup, groups=2): + super(GroupConv, self).__init__() + self.inp = inp + self.oup = oup + self.groups = groups + self.conv = nn.Sequential( + nn.Conv2D( + inp, oup, 1, 1, 0, bias_attr=False, groups=self.groups[0]), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class DepthConv(nn.Layer): + def __init__(self, inp, oup, kernel_size, stride): + super(DepthConv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup, + kernel_size, + stride, + kernel_size // 2, + bias_attr=False, + groups=inp), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DYShiftMax(nn.Layer): + def __init__(self, + inp, + oup, + reduction=4, + act_max=1.0, + act_relu=True, + init_a=[0.0, 0.0], + init_b=[0.0, 0.0], + relu_before_pool=False, + g=None, + expansion=False): + super(DYShiftMax, self).__init__() + self.oup = oup + self.act_max = act_max * 2 + self.act_relu = act_relu + self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool == True else + nn.Sequential(), nn.AdaptiveAvgPool2D(1)) + + self.exp = 4 if act_relu else 2 + self.init_a = init_a + self.init_b = init_b + + # determine squeeze + squeeze = make_divisible(inp // reduction, 4) + if squeeze < 4: + squeeze = 4 + + self.fc = nn.Sequential( + nn.Linear(inp, squeeze), + nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid()) + + if g is None: + g = 1 + self.g = g[1] + if self.g != 1 and expansion: + self.g = inp // self.g + + self.gc = inp // self.g + index = paddle.to_tensor([range(inp)]) + index = paddle.reshape(index, [1, inp, 1, 1]) + index = paddle.reshape(index, [1, self.g, self.gc, 1, 1]) + indexgs = paddle.split(index, [1, self.g - 1], axis=1) + indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1) + indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2) + indexs = paddle.concat((indexs[1], indexs[0]), axis=2) + self.index = paddle.reshape(indexs, [inp]) + self.expansion = expansion + + def forward(self, x): + x_in = x + x_out = x + + b, c, _, _ = x_in.shape + y = self.avg_pool(x_in) + y = paddle.reshape(y, [b, c]) + y = self.fc(y) + y = paddle.reshape(y, [b, self.oup * 
self.exp, 1, 1]) + y = (y - 0.5) * self.act_max + + n2, c2, h2, w2 = x_out.shape + x2 = paddle.to_tensor(x_out.numpy()[:, self.index.numpy(), :, :]) + + if self.exp == 4: + temp = y.shape + a1, b1, a2, b2 = paddle.split(y, temp[1] // self.oup, axis=1) + + a1 = a1 + self.init_a[0] + a2 = a2 + self.init_a[1] + + b1 = b1 + self.init_b[0] + b2 = b2 + self.init_b[1] + + z1 = x_out * a1 + x2 * b1 + z2 = x_out * a2 + x2 * b2 + + out = paddle.maximum(z1, z2) + + elif self.exp == 2: + temp = y.shape + a1, b1 = paddle.split(y, temp[1] // self.oup, axis=1) + a1 = a1 + self.init_a[0] + b1 = b1 + self.init_b[0] + out = x_out * a1 + x2 * b1 + + return out + + +class DYMicroBlock(nn.Layer): + def __init__(self, + inp, + oup, + kernel_size=3, + stride=1, + ch_exp=(2, 2), + ch_per_group=4, + groups_1x1=(1, 1), + depthsep=True, + shuffle=False, + activation_cfg=None): + super(DYMicroBlock, self).__init__() + + self.identity = stride == 1 and inp == oup + + y1, y2, y3 = activation_cfg['dy'] + act_reduction = 8 * activation_cfg['ratio'] + init_a = activation_cfg['init_a'] + init_b = activation_cfg['init_b'] + + t1 = ch_exp + gs1 = ch_per_group + hidden_fft, g1, g2 = groups_1x1 + hidden_dim2 = inp * t1[0] * t1[1] + + if gs1[0] == 0: + self.layers = nn.Sequential( + DepthSpatialSepConv(inp, t1, kernel_size, stride), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(), + ChannelShuffle(hidden_dim2 // 2) + if shuffle and y2 != 0 else nn.Sequential(), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction // 2, + init_b=[0.0, 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Sequential(), + ChannelShuffle(g2) if shuffle else nn.Sequential(), + ChannelShuffle(oup // 2) + if shuffle and oup % 2 == 0 and y3 != 0 else nn.Sequential(), ) + elif g2 == 0: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction, + init_b=[0.0, 0.0], + g=gs1, + expansion=False) if y3 > 0 else nn.Sequential(), ) + else: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y1 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y1 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]) if shuffle else nn.Sequential(), + DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride) + if depthsep else + DepthConv(hidden_dim2, hidden_dim2, kernel_size, stride), + nn.Sequential(), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=2.0, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=True) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(hidden_dim2 // 4) + if shuffle and y1 != 0 and y2 != 0 else nn.Sequential() + if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=2.0, + act_relu=False, + init_a=[1.0, 0.0], + reduction=act_reduction // 2 + if oup < hidden_dim2 else act_reduction, + init_b=[0.0, 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Sequential(), + ChannelShuffle(g2) if shuffle else 
nn.Sequential(), + ChannelShuffle(oup // 2) + if shuffle and y3 != 0 else nn.Sequential(), ) + + def forward(self, x): + identity = x + out = self.layers(x) + + if self.identity: + out = out + identity + + return out + + +class MicroNet(nn.Layer): + """ + the MicroNet backbone network for recognition module. + Args: + mode(str): {'M0', 'M1', 'M2', 'M3'} + Four models are proposed based on four different computational costs (4M, 6M, 12M, 21M MAdds) + Default: 'M3'. + """ + + def __init__(self, mode='M3', **kwargs): + super(MicroNet, self).__init__() + + self.cfgs = get_micronet_config(mode) + + activation_cfg = {} + if mode == 'M0': + input_channel = 4 + stem_groups = 2, 2 + out_ch = 384 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M1': + input_channel = 6 + stem_groups = 3, 2 + out_ch = 576 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M2': + input_channel = 8 + stem_groups = 4, 2 + out_ch = 768 + activation_cfg['init_a'] = 1.0, 1.0 + activation_cfg['init_b'] = 0.0, 0.0 + elif mode == 'M3': + input_channel = 12 + stem_groups = 4, 3 + out_ch = 432 + activation_cfg['init_a'] = 1.0, 0.5 + activation_cfg['init_b'] = 0.0, 0.5 + else: + raise NotImplementedError("mode[" + mode + + "_model] is not implemented!") + + layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)] + + for idx, val in enumerate(self.cfgs): + s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r = val + + t1 = (c1, c2) + gs1 = (g1, g2) + gs2 = (c3, g3, g4) + activation_cfg['dy'] = [y1, y2, y3] + activation_cfg['ratio'] = r + + output_channel = c + layers.append( + DYMicroBlock( + input_channel, + output_channel, + kernel_size=ks, + stride=s, + ch_exp=t1, + ch_per_group=gs1, + groups_1x1=gs2, + depthsep=True, + shuffle=True, + activation_cfg=activation_cfg, )) + input_channel = output_channel + for i in range(1, n): + layers.append( + DYMicroBlock( + input_channel, + output_channel, + kernel_size=ks, + stride=1, + ch_exp=t1, + ch_per_group=gs1, + groups_1x1=gs2, + depthsep=True, + shuffle=True, + activation_cfg=activation_cfg, )) + input_channel = output_channel + self.features = nn.Sequential(*layers) + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + + self.out_channels = make_divisible(out_ch) + + def forward(self, x): + x = self.features(x) + x = self.pool(x) + return x diff --git a/ppocr/modeling/backbones/rec_mobilenet_v3.py b/ppocr/modeling/backbones/rec_mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..917e000d94ea01ce0057e08c1f4839240561a368 --- /dev/null +++ b/ppocr/modeling/backbones/rec_mobilenet_v3.py @@ -0,0 +1,138 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
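A usage sketch for the MicroNet recognition backbone above (illustrative; the 32x320 input mirrors a common PP-OCR recognition shape but is an assumption here). Mode 'M3' selects the 21M-MAdds configuration from M3_cfgs; the trailing MaxPool2D halves the feature map once more, and out_channels reports the width of the last stage:

import paddle
from ppocr.modeling.backbones.rec_micronet import MicroNet

net = MicroNet(mode='M3')
net.eval()
x = paddle.randn([1, 3, 32, 320])
y = net(x)
print(y.shape, net.out_channels)  # single pooled feature map and its channel count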
+ +from paddle import nn + +from ppocr.modeling.backbones.det_mobilenet_v3 import ResidualUnit, ConvBNLayer, make_divisible + +__all__ = ['MobileNetV3'] + + +class MobileNetV3(nn.Layer): + def __init__(self, + in_channels=3, + model_name='small', + scale=0.5, + large_stride=None, + small_stride=None, + disable_se=False, + **kwargs): + super(MobileNetV3, self).__init__() + self.disable_se = disable_se + if small_stride is None: + small_stride = [2, 2, 2, 2] + if large_stride is None: + large_stride = [1, 2, 2, 2] + + assert isinstance(large_stride, list), "large_stride type must " \ + "be list but got {}".format(type(large_stride)) + assert isinstance(small_stride, list), "small_stride type must " \ + "be list but got {}".format(type(small_stride)) + assert len(large_stride) == 4, "large_stride length must be " \ + "4 but got {}".format(len(large_stride)) + assert len(small_stride) == 4, "small_stride length must be " \ + "4 but got {}".format(len(small_stride)) + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'relu', large_stride[0]], + [3, 64, 24, False, 'relu', (large_stride[1], 1)], + [3, 72, 24, False, 'relu', 1], + [5, 72, 40, True, 'relu', (large_stride[2], 1)], + [5, 120, 40, True, 'relu', 1], + [5, 120, 40, True, 'relu', 1], + [3, 240, 80, False, 'hardswish', 1], + [3, 200, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 184, 80, False, 'hardswish', 1], + [3, 480, 112, True, 'hardswish', 1], + [3, 672, 112, True, 'hardswish', 1], + [5, 672, 160, True, 'hardswish', (large_stride[3], 1)], + [5, 960, 160, True, 'hardswish', 1], + [5, 960, 160, True, 'hardswish', 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'relu', (small_stride[0], 1)], + [3, 72, 24, False, 'relu', (small_stride[1], 1)], + [3, 88, 24, False, 'relu', 1], + [5, 96, 40, True, 'hardswish', (small_stride[2], 1)], + [5, 240, 40, True, 'hardswish', 1], + [5, 240, 40, True, 'hardswish', 1], + [5, 120, 48, True, 'hardswish', 1], + [5, 144, 48, True, 'hardswish', 1], + [5, 288, 96, True, 'hardswish', (small_stride[3], 1)], + [5, 576, 96, True, 'hardswish', 1], + [5, 576, 96, True, 'hardswish', 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError("mode[" + model_name + + "_model] is not implemented!") + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert scale in supported_scale, \ + "supported scales are {} but input scale is {}".format(supported_scale, scale) + + inplanes = 16 + # conv1 + self.conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act='hardswish') + i = 0 + block_list = [] + inplanes = make_divisible(inplanes * scale) + for (k, exp, c, se, nl, s) in cfg: + se = se and not self.disable_se + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl)) + inplanes = make_divisible(scale * c) + i += 1 + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act='hardswish') + + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = make_divisible(scale * cls_ch_squeeze) + + def forward(self, x): + x = self.conv1(x) + x = 
self.blocks(x) + x = self.conv2(x) + x = self.pool(x) + return x diff --git a/ppocr/modeling/backbones/rec_mv1_enhance.py b/ppocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6af5e82cf13ac42d9a970787596a65986ade54 --- /dev/null +++ b/ppocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,256 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is refer from: https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/arch/backbone/legendary_models/pp_lcnet.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import numpy as np +import paddle +from paddle import ParamAttr, reshape, transpose +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay +from paddle.nn.functional import hardswish, hardsigmoid + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Layer): + def __init__(self, + in_channels=3, + scale=0.5, + last_conv_stride=1, + last_pool_type='max', + **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = 
DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=last_conv_stride, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + if last_pool_type == 'avg': + self.pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) + else: + self.pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + return paddle.multiply(x=inputs, y=outputs) diff --git a/ppocr/modeling/backbones/rec_nrtr_mtb.py b/ppocr/modeling/backbones/rec_nrtr_mtb.py new file mode 100644 index 0000000000000000000000000000000000000000..22e02a6371c3ff8b28fd88b5cfa1087309d551f8 --- /dev/null +++ b/ppocr/modeling/backbones/rec_nrtr_mtb.py @@ -0,0 +1,48 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import nn +import paddle + + +class MTB(nn.Layer): + def __init__(self, cnn_num, in_channels): + super(MTB, self).__init__() + self.block = nn.Sequential() + self.out_channels = in_channels + self.cnn_num = cnn_num + if self.cnn_num == 2: + for i in range(self.cnn_num): + self.block.add_sublayer( + 'conv_{}'.format(i), + nn.Conv2D( + in_channels=in_channels + if i == 0 else 32 * (2**(i - 1)), + out_channels=32 * (2**i), + kernel_size=3, + stride=2, + padding=1)) + self.block.add_sublayer('relu_{}'.format(i), nn.ReLU()) + self.block.add_sublayer('bn_{}'.format(i), + nn.BatchNorm2D(32 * (2**i))) + + def forward(self, images): + x = self.block(images) + if self.cnn_num == 2: + # (b, w, h, c) + x = paddle.transpose(x, [0, 3, 2, 1]) + x_shape = paddle.shape(x) + x = paddle.reshape( + x, [x_shape[0], x_shape[1], x_shape[2] * x_shape[3]]) + return x diff --git a/ppocr/modeling/backbones/rec_resnet_31.py b/ppocr/modeling/backbones/rec_resnet_31.py new file mode 100644 index 0000000000000000000000000000000000000000..46dc374008b56a20dbd4be257775368e9cbbace4 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_31.py @@ -0,0 +1,231 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
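+# Note on the backbone defined below (a summary of its own stage definitions, not a
+# change to the upstream logic): ResNet31 preserves the horizontal text axis in the
+# later stages by max-pooling with kernel/stride (2, 1) in conv4 and only optionally
+# pooling in conv5 (last_stage_pool), so a 32-pixel-high line image comes out as a
+# wide 512-channel feature map suited to sequence decoding.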
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/layers/conv_layer.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/backbones/resnet31_ocr.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + +__all__ = ["ResNet31"] + +def conv3x3(in_channel, out_channel, stride=1, conv_weight_attr=None): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, in_channels, channels, stride=1, downsample=False, conv_weight_attr=None, bn_weight_attr=None): + super().__init__() + self.conv1 = conv3x3(in_channels, channels, stride, + conv_weight_attr=conv_weight_attr) + self.bn1 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels, + conv_weight_attr=conv_weight_attr) + self.bn2 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr) + self.downsample = downsample + if downsample: + self.downsample = nn.Sequential( + nn.Conv2D( + in_channels, + channels * self.expansion, + 1, + stride, + weight_attr=conv_weight_attr, + bias_attr=False), + nn.BatchNorm2D(channels * self.expansion, weight_attr=bn_weight_attr)) + else: + self.downsample = nn.Sequential() + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet31(nn.Layer): + ''' + Args: + in_channels (int): Number of channels of input image tensor. + layers (list[int]): List of BasicBlock number for each stage. + channels (list[int]): List of out_channels of Conv2d layer. + out_indices (None | Sequence[int]): Indices of output stages. + last_stage_pool (bool): If True, add `MaxPool2d` layer to last stage. + init_type (None | str): the config to control the initialization. 
+ ''' + + def __init__(self, + in_channels=3, + layers=[1, 2, 5, 3], + channels=[64, 128, 256, 256, 512, 512, 512], + out_indices=None, + last_stage_pool=False, + init_type=None): + super(ResNet31, self).__init__() + assert isinstance(in_channels, int) + assert isinstance(last_stage_pool, bool) + + self.out_indices = out_indices + self.last_stage_pool = last_stage_pool + + conv_weight_attr = None + bn_weight_attr = None + + if init_type is not None: + support_dict = ['KaimingNormal'] + assert init_type in support_dict, Exception( + "resnet31 only support {}".format(support_dict)) + conv_weight_attr = nn.initializer.KaimingNormal() + bn_weight_attr = ParamAttr(initializer=nn.initializer.Uniform(), learning_rate=1) + + # conv 1 (Conv Conv) + self.conv1_1 = nn.Conv2D( + in_channels, channels[0], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn1_1 = nn.BatchNorm2D(channels[0], weight_attr=bn_weight_attr) + self.relu1_1 = nn.ReLU() + + self.conv1_2 = nn.Conv2D( + channels[0], channels[1], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn1_2 = nn.BatchNorm2D(channels[1], weight_attr=bn_weight_attr) + self.relu1_2 = nn.ReLU() + + # conv 2 (Max-pooling, Residual block, Conv) + self.pool2 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block2 = self._make_layer(channels[1], channels[2], layers[0], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) + self.conv2 = nn.Conv2D( + channels[2], channels[2], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn2 = nn.BatchNorm2D(channels[2], weight_attr=bn_weight_attr) + self.relu2 = nn.ReLU() + + # conv 3 (Max-pooling, Residual block, Conv) + self.pool3 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block3 = self._make_layer(channels[2], channels[3], layers[1], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) + self.conv3 = nn.Conv2D( + channels[3], channels[3], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn3 = nn.BatchNorm2D(channels[3], weight_attr=bn_weight_attr) + self.relu3 = nn.ReLU() + + # conv 4 (Max-pooling, Residual block, Conv) + self.pool4 = nn.MaxPool2D( + kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True) + self.block4 = self._make_layer(channels[3], channels[4], layers[2], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) + self.conv4 = nn.Conv2D( + channels[4], channels[4], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn4 = nn.BatchNorm2D(channels[4], weight_attr=bn_weight_attr) + self.relu4 = nn.ReLU() + + # conv 5 ((Max-pooling), Residual block, Conv) + self.pool5 = None + if self.last_stage_pool: + self.pool5 = nn.MaxPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.block5 = self._make_layer(channels[4], channels[5], layers[3], + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr) + self.conv5 = nn.Conv2D( + channels[5], channels[5], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr) + self.bn5 = nn.BatchNorm2D(channels[5], weight_attr=bn_weight_attr) + self.relu5 = nn.ReLU() + + self.out_channels = channels[-1] + + def _make_layer(self, input_channels, output_channels, blocks, conv_weight_attr=None, bn_weight_attr=None): + layers = [] + for _ in range(blocks): + downsample = None + if input_channels != output_channels: + downsample = nn.Sequential( + nn.Conv2D( + input_channels, + output_channels, + kernel_size=1, + stride=1, + 
weight_attr=conv_weight_attr, + bias_attr=False), + nn.BatchNorm2D(output_channels, weight_attr=bn_weight_attr)) + + layers.append( + BasicBlock( + input_channels, output_channels, downsample=downsample, + conv_weight_attr=conv_weight_attr, bn_weight_attr=bn_weight_attr)) + input_channels = output_channels + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1_1(x) + x = self.bn1_1(x) + x = self.relu1_1(x) + + x = self.conv1_2(x) + x = self.bn1_2(x) + x = self.relu1_2(x) + + outs = [] + for i in range(4): + layer_index = i + 2 + pool_layer = getattr(self, f'pool{layer_index}') + block_layer = getattr(self, f'block{layer_index}') + conv_layer = getattr(self, f'conv{layer_index}') + bn_layer = getattr(self, f'bn{layer_index}') + relu_layer = getattr(self, f'relu{layer_index}') + + if pool_layer is not None: + x = pool_layer(x) + x = block_layer(x) + x = conv_layer(x) + x = bn_layer(x) + x = relu_layer(x) + + outs.append(x) + + if self.out_indices is not None: + return tuple([outs[i] for i in self.out_indices]) + + return x diff --git a/ppocr/modeling/backbones/rec_resnet_32.py b/ppocr/modeling/backbones/rec_resnet_32.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd19251a3ed43a472d49f03743ead1491aa86ac --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_32.py @@ -0,0 +1,269 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
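+# Note (derived from the layer definitions below, stated here as a summary): this
+# 32-layer variant uses a (2, 1)-stride max-pool in stage 3 and finishes with a 2x2
+# conv of stride (2, 1) followed by a valid 2x2 conv, so for the usual 32-pixel-high
+# input the feature height collapses to 1 while the width is roughly preserved for
+# the recognition head.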
+""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/davarocr/davar_rcg/models/backbones/ResNet32.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.nn as nn + +__all__ = ["ResNet32"] + +conv_weight_attr = nn.initializer.KaimingNormal() + +class ResNet32(nn.Layer): + """ + Feature Extractor is proposed in FAN Ref [1] + + Ref [1]: Focusing Attention: Towards Accurate Text Recognition in Neural Images ICCV-2017 + """ + + def __init__(self, in_channels, out_channels=512): + """ + + Args: + in_channels (int): input channel + output_channel (int): output channel + """ + super(ResNet32, self).__init__() + self.out_channels = out_channels + self.ConvNet = ResNet(in_channels, out_channels, BasicBlock, [1, 2, 5, 3]) + + def forward(self, inputs): + """ + Args: + inputs: input feature + + Returns: + output feature + + """ + return self.ConvNet(inputs) + +class BasicBlock(nn.Layer): + """Res-net Basic Block""" + expansion = 1 + + def __init__(self, inplanes, planes, + stride=1, downsample=None, + norm_type='BN', **kwargs): + """ + Args: + inplanes (int): input channel + planes (int): channels of the middle feature + stride (int): stride of the convolution + downsample (int): type of the down_sample + norm_type (str): type of the normalization + **kwargs (None): backup parameter + """ + super(BasicBlock, self).__init__() + self.conv1 = self._conv3x3(inplanes, planes) + self.bn1 = nn.BatchNorm2D(planes) + self.conv2 = self._conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def _conv3x3(self, in_planes, out_planes, stride=1): + """ + + Args: + in_planes (int): input channel + out_planes (int): channels of the middle feature + stride (int): stride of the convolution + Returns: + nn.Layer: Conv2D with kernel = 3 + + """ + + return nn.Conv2D(in_planes, out_planes, + kernel_size=3, stride=stride, + padding=1, weight_attr=conv_weight_attr, + bias_attr=False) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + +class ResNet(nn.Layer): + """Res-Net network structure""" + def __init__(self, input_channel, + output_channel, block, layers): + """ + + Args: + input_channel (int): input channel + output_channel (int): output channel + block (BasicBlock): convolution block + layers (list): layers of the block + """ + super(ResNet, self).__init__() + + self.output_channel_block = [int(output_channel / 4), + int(output_channel / 2), + output_channel, + output_channel] + + self.inplanes = int(output_channel / 8) + self.conv0_1 = nn.Conv2D(input_channel, int(output_channel / 16), + kernel_size=3, stride=1, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn0_1 = nn.BatchNorm2D(int(output_channel / 16)) + self.conv0_2 = nn.Conv2D(int(output_channel / 16), self.inplanes, + kernel_size=3, stride=1, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn0_2 = nn.BatchNorm2D(self.inplanes) + self.relu = nn.ReLU() + + self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer1 = self._make_layer(block, + self.output_channel_block[0], + layers[0]) + self.conv1 = nn.Conv2D(self.output_channel_block[0], + self.output_channel_block[0], + 
kernel_size=3, stride=1, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(self.output_channel_block[0]) + + self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer2 = self._make_layer(block, + self.output_channel_block[1], + layers[1], stride=1) + self.conv2 = nn.Conv2D(self.output_channel_block[1], + self.output_channel_block[1], + kernel_size=3, stride=1, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False,) + self.bn2 = nn.BatchNorm2D(self.output_channel_block[1]) + + self.maxpool3 = nn.MaxPool2D(kernel_size=2, + stride=(2, 1), + padding=(0, 1)) + self.layer3 = self._make_layer(block, self.output_channel_block[2], + layers[2], stride=1) + self.conv3 = nn.Conv2D(self.output_channel_block[2], + self.output_channel_block[2], + kernel_size=3, stride=1, + padding=1, + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn3 = nn.BatchNorm2D(self.output_channel_block[2]) + + self.layer4 = self._make_layer(block, self.output_channel_block[3], + layers[3], stride=1) + self.conv4_1 = nn.Conv2D(self.output_channel_block[3], + self.output_channel_block[3], + kernel_size=2, stride=(2, 1), + padding=(0, 1), + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn4_1 = nn.BatchNorm2D(self.output_channel_block[3]) + self.conv4_2 = nn.Conv2D(self.output_channel_block[3], + self.output_channel_block[3], + kernel_size=2, stride=1, + padding=0, + weight_attr=conv_weight_attr, + bias_attr=False) + self.bn4_2 = nn.BatchNorm2D(self.output_channel_block[3]) + + def _make_layer(self, block, planes, blocks, stride=1): + """ + + Args: + block (block): convolution block + planes (int): input channels + blocks (list): layers of the block + stride (int): stride of the convolution + + Returns: + nn.Sequential: the combination of the convolution block + + """ + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, + weight_attr=conv_weight_attr, + bias_attr=False), + nn.BatchNorm2D(planes * block.expansion), + ) + + layers = list() + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv0_1(x) + x = self.bn0_1(x) + x = self.relu(x) + x = self.conv0_2(x) + x = self.bn0_2(x) + x = self.relu(x) + + x = self.maxpool1(x) + x = self.layer1(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.maxpool2(x) + x = self.layer2(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + x = self.maxpool3(x) + x = self.layer3(x) + x = self.conv3(x) + x = self.bn3(x) + x = self.relu(x) + + x = self.layer4(x) + x = self.conv4_1(x) + x = self.bn4_1(x) + x = self.relu(x) + x = self.conv4_2(x) + x = self.bn4_2(x) + x = self.relu(x) + return x diff --git a/ppocr/modeling/backbones/rec_resnet_45.py b/ppocr/modeling/backbones/rec_resnet_45.py new file mode 100644 index 0000000000000000000000000000000000000000..083eb7f48811cf6887845f98bbeae315b727287d --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_45.py @@ -0,0 +1,144 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FangShancheng/ABINet/tree/main/modules +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +import math + +__all__ = ["ResNet45"] + + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=1, + stride=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + +def conv3x3(in_channel, out_channel, stride=1): + return nn.Conv2D( + in_channel, + out_channel, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, in_channels, channels, stride=1, downsample=None): + super().__init__() + self.conv1 = conv1x1(in_channels, channels) + self.bn1 = nn.BatchNorm2D(channels) + self.relu = nn.ReLU() + self.conv2 = conv3x3(channels, channels, stride) + self.bn2 = nn.BatchNorm2D(channels) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + + +class ResNet45(nn.Layer): + def __init__(self, + in_channels=3, + block=BasicBlock, + layers=[3, 4, 6, 6, 3], + strides=[2, 1, 2, 1, 1]): + self.inplanes = 32 + super(ResNet45, self).__init__() + self.conv1 = nn.Conv2D( + in_channels, + 32, + kernel_size=3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(32) + self.relu = nn.ReLU() + + self.layer1 = self._make_layer(block, 32, layers[0], stride=strides[0]) + self.layer2 = self._make_layer(block, 64, layers[1], stride=strides[1]) + self.layer3 = self._make_layer(block, 128, layers[2], stride=strides[2]) + self.layer4 = self._make_layer(block, 256, layers[3], stride=strides[3]) + self.layer5 = self._make_layer(block, 512, layers[4], stride=strides[4]) + self.out_channels = 512 + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + # downsample = True + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False), + nn.BatchNorm2D(planes * block.expansion), ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = 
self.layer4(x) + x = self.layer5(x) + return x diff --git a/ppocr/modeling/backbones/rec_resnet_aster.py b/ppocr/modeling/backbones/rec_resnet_aster.py new file mode 100644 index 0000000000000000000000000000000000000000..782dc393ea3c8b67d68fb9f4b038afc85ffcad93 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_aster.py @@ -0,0 +1,143 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/resnet_aster.py +""" +import paddle +import paddle.nn as nn + +import sys +import math + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2D( + in_planes, out_planes, kernel_size=1, stride=stride, bias_attr=False) + + +def get_sinusoid_encoding(n_position, feat_dim, wave_length=10000): + # [n_position] + positions = paddle.arange(0, n_position) + # [feat_dim] + dim_range = paddle.arange(0, feat_dim) + dim_range = paddle.pow(wave_length, 2 * (dim_range // 2) / feat_dim) + # [n_position, feat_dim] + angles = paddle.unsqueeze( + positions, axis=1) / paddle.unsqueeze( + dim_range, axis=0) + angles = paddle.cast(angles, "float32") + angles[:, 0::2] = paddle.sin(angles[:, 0::2]) + angles[:, 1::2] = paddle.cos(angles[:, 1::2]) + return angles + + +class AsterBlock(nn.Layer): + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(AsterBlock, self).__init__() + self.conv1 = conv1x1(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2D(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + return out + + +class ResNet_ASTER(nn.Layer): + """For aster or crnn""" + + def __init__(self, with_lstm=True, n_group=1, in_channels=3): + super(ResNet_ASTER, self).__init__() + self.with_lstm = with_lstm + self.n_group = n_group + + self.layer0 = nn.Sequential( + nn.Conv2D( + in_channels, + 32, + kernel_size=(3, 3), + stride=1, + padding=1, + bias_attr=False), + nn.BatchNorm2D(32), + nn.ReLU()) + + self.inplanes = 32 + self.layer1 = self._make_layer(32, 3, [2, 2]) # [16, 50] + self.layer2 = self._make_layer(64, 4, [2, 2]) # [8, 25] + self.layer3 = self._make_layer(128, 6, [2, 1]) # [4, 25] + self.layer4 = self._make_layer(256, 6, [2, 1]) # [2, 25] + self.layer5 = self._make_layer(512, 3, [2, 1]) # [1, 25] + + if with_lstm: + self.rnn = nn.LSTM(512, 256, direction="bidirect", num_layers=2) + self.out_channels = 2 * 256 + else: + self.out_channels = 512 + 
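+ # Shape sketch (follows from the per-stage strides commented above, assuming the
+ # usual 32x100 ASTER input): layer5 yields [N, 512, 1, 25]; forward() squeezes the
+ # height and transposes to [N, 25, 512], and the optional bidirectional LSTM keeps
+ # that shape since 2 * 256 = 512.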
+ def _make_layer(self, planes, blocks, stride): + downsample = None + if stride != [1, 1] or self.inplanes != planes: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes, stride), nn.BatchNorm2D(planes)) + + layers = [] + layers.append(AsterBlock(self.inplanes, planes, stride, downsample)) + self.inplanes = planes + for _ in range(1, blocks): + layers.append(AsterBlock(self.inplanes, planes)) + return nn.Sequential(*layers) + + def forward(self, x): + x0 = self.layer0(x) + x1 = self.layer1(x0) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + x5 = self.layer5(x4) + + cnn_feat = x5.squeeze(2) # [N, c, w] + cnn_feat = paddle.transpose(cnn_feat, perm=[0, 2, 1]) + if self.with_lstm: + rnn_feat, _ = self.rnn(cnn_feat) + return rnn_feat + else: + return cnn_feat \ No newline at end of file diff --git a/ppocr/modeling/backbones/rec_resnet_fpn.py b/ppocr/modeling/backbones/rec_resnet_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..79efd6e41e231ecad99aa4d01a8226a8550bd1ef --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_fpn.py @@ -0,0 +1,306 @@ +#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import paddle +import numpy as np + +__all__ = ["ResNetFPN"] + + +class ResNetFPN(nn.Layer): + def __init__(self, in_channels=1, layers=50, **kwargs): + super(ResNetFPN, self).__init__() + supported_layers = { + 18: { + 'depth': [2, 2, 2, 2], + 'block_class': BasicBlock + }, + 34: { + 'depth': [3, 4, 6, 3], + 'block_class': BasicBlock + }, + 50: { + 'depth': [3, 4, 6, 3], + 'block_class': BottleneckBlock + }, + 101: { + 'depth': [3, 4, 23, 3], + 'block_class': BottleneckBlock + }, + 152: { + 'depth': [3, 8, 36, 3], + 'block_class': BottleneckBlock + } + } + stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)] + num_filters = [64, 128, 256, 512] + self.depth = supported_layers[layers]['depth'] + self.F = [] + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1") + self.block_list = [] + in_ch = 64 + if layers >= 50: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + block_list = self.add_sublayer( + "bottleneckBlock_{}_{}".format(block, i), + BottleneckBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + name=conv_name)) + in_ch = num_filters[block] * 4 + self.block_list.append(block_list) + self.F.append(block_list) + else: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + conv_name = "res" + str(block + 2) + chr(97 
+ i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + is_first=block == i == 0, + name=conv_name)) + in_ch = basic_block.out_channels + self.block_list.append(basic_block) + out_ch_list = [in_ch // 4, in_ch // 2, in_ch] + self.base_block = [] + self.conv_trans = [] + self.bn_block = [] + for i in [-2, -3]: + in_channels = out_ch_list[i + 1] + out_ch_list[i] + + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_0".format(i), + nn.Conv2D( + in_channels=in_channels, + out_channels=out_ch_list[i], + kernel_size=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_1".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=out_ch_list[i], + kernel_size=3, + padding=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_2".format(i), + nn.BatchNorm( + num_channels=out_ch_list[i], + act="relu", + param_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True)))) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_3".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=512, + kernel_size=1, + bias_attr=ParamAttr(trainable=True), + weight_attr=ParamAttr(trainable=True)))) + self.out_channels = 512 + + def __call__(self, x): + x = self.conv(x) + fpn_list = [] + F = [] + for i in range(len(self.depth)): + fpn_list.append(np.sum(self.depth[:i + 1])) + + for i, block in enumerate(self.block_list): + x = block(x) + for number in fpn_list: + if i + 1 == number: + F.append(x) + base = F[-1] + + j = 0 + for i, block in enumerate(self.base_block): + if i % 3 == 0 and i < 6: + j = j + 1 + b, c, w, h = F[-j - 1].shape + if [w, h] == list(base.shape[2:]): + base = base + else: + base = self.conv_trans[j - 1](base) + base = self.bn_block[j - 1](base) + base = paddle.concat([base, F[-j - 1]], axis=1) + base = block(base) + return base + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=2 if stride == (1, 1) else kernel_size, + dilation=2 if stride == (1, 1) else 1, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '.conv2d.output.1.w_0'), + bias_attr=False, ) + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name=name + '.output.1.w_0'), + bias_attr=ParamAttr(name=name + '.output.1.b_0'), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def __call__(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first=False): + super(ShortCut, self).__init__() + self.use_conv = True + + if in_channels != out_channels or stride != 1 or is_first == True: + if stride == (1, 1): + self.conv = ConvBNLayer( + in_channels, out_channels, 1, 1, name=name) + else: # stride==(2,2) + self.conv = ConvBNLayer( + in_channels, out_channels, 1, stride, 
name=name) + else: + self.use_conv = False + + def forward(self, x): + if self.use_conv: + x = self.conv(x) + return x + + +class BottleneckBlock(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels * 4, + stride=stride, + is_first=False, + name=name + "_branch1") + self.out_channels = out_channels * 4 + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = self.conv2(y) + y = y + self.short(x) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + act='relu', + stride=stride, + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + is_first=is_first, + name=name + "_branch1") + self.out_channels = out_channels + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = y + self.short(x) + return F.relu(y) diff --git a/ppocr/modeling/backbones/rec_resnet_rfl.py b/ppocr/modeling/backbones/rec_resnet_rfl.py new file mode 100644 index 0000000000000000000000000000000000000000..fd317c6ea67acb4e02aadeb7c77c09eb92c4ca95 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_rfl.py @@ -0,0 +1,348 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/backbones/ResNetRFL.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
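+
+# The classes below implement the RFL (reciprocal feature learning) backbone: RFLBase
+# provides the shared stem, and ResNetRFL builds two optional branches on top of it,
+# a counting branch (use_cnt) and a sequence-recognition branch (use_seq), returning
+# them as the list [visual_feature, sequence_feature] (an entry is None when the
+# corresponding branch is disabled).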
+ + +class BasicBlock(nn.Layer): + """Res-net Basic Block""" + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + norm_type='BN', + **kwargs): + """ + Args: + inplanes (int): input channel + planes (int): channels of the middle feature + stride (int): stride of the convolution + downsample (int): type of the down_sample + norm_type (str): type of the normalization + **kwargs (None): backup parameter + """ + super(BasicBlock, self).__init__() + self.conv1 = self._conv3x3(inplanes, planes) + self.bn1 = nn.BatchNorm(planes) + self.conv2 = self._conv3x3(planes, planes) + self.bn2 = nn.BatchNorm(planes) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride + + def _conv3x3(self, in_planes, out_planes, stride=1): + + return nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + out += residual + out = self.relu(out) + + return out + + +class ResNetRFL(nn.Layer): + def __init__(self, + in_channels, + out_channels=512, + use_cnt=True, + use_seq=True): + """ + + Args: + in_channels (int): input channel + out_channels (int): output channel + """ + super(ResNetRFL, self).__init__() + assert use_cnt or use_seq + self.use_cnt, self.use_seq = use_cnt, use_seq + self.backbone = RFLBase(in_channels) + + self.out_channels = out_channels + self.out_channels_block = [ + int(self.out_channels / 4), int(self.out_channels / 2), + self.out_channels, self.out_channels + ] + block = BasicBlock + layers = [1, 2, 5, 3] + self.inplanes = int(self.out_channels // 2) + + self.relu = nn.ReLU() + if self.use_seq: + self.maxpool3 = nn.MaxPool2D( + kernel_size=2, stride=(2, 1), padding=(0, 1)) + self.layer3 = self._make_layer( + block, self.out_channels_block[2], layers[2], stride=1) + self.conv3 = nn.Conv2D( + self.out_channels_block[2], + self.out_channels_block[2], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn3 = nn.BatchNorm(self.out_channels_block[2]) + + self.layer4 = self._make_layer( + block, self.out_channels_block[3], layers[3], stride=1) + self.conv4_1 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=(2, 1), + padding=(0, 1), + bias_attr=False) + self.bn4_1 = nn.BatchNorm(self.out_channels_block[3]) + self.conv4_2 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=1, + padding=0, + bias_attr=False) + self.bn4_2 = nn.BatchNorm(self.out_channels_block[3]) + + if self.use_cnt: + self.inplanes = int(self.out_channels // 2) + self.v_maxpool3 = nn.MaxPool2D( + kernel_size=2, stride=(2, 1), padding=(0, 1)) + self.v_layer3 = self._make_layer( + block, self.out_channels_block[2], layers[2], stride=1) + self.v_conv3 = nn.Conv2D( + self.out_channels_block[2], + self.out_channels_block[2], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.v_bn3 = nn.BatchNorm(self.out_channels_block[2]) + + self.v_layer4 = self._make_layer( + block, self.out_channels_block[3], layers[3], stride=1) + self.v_conv4_1 = nn.Conv2D( + self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=(2, 1), + padding=(0, 1), + bias_attr=False) + self.v_bn4_1 = nn.BatchNorm(self.out_channels_block[3]) + self.v_conv4_2 = nn.Conv2D( + 
self.out_channels_block[3], + self.out_channels_block[3], + kernel_size=2, + stride=1, + padding=0, + bias_attr=False) + self.v_bn4_2 = nn.BatchNorm(self.out_channels_block[3]) + + def _make_layer(self, block, planes, blocks, stride=1): + + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm(planes * block.expansion), ) + + layers = list() + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, inputs): + x_1 = self.backbone(inputs) + + if self.use_cnt: + v_x = self.v_maxpool3(x_1) + v_x = self.v_layer3(v_x) + v_x = self.v_conv3(v_x) + v_x = self.v_bn3(v_x) + visual_feature_2 = self.relu(v_x) + + v_x = self.v_layer4(visual_feature_2) + v_x = self.v_conv4_1(v_x) + v_x = self.v_bn4_1(v_x) + v_x = self.relu(v_x) + v_x = self.v_conv4_2(v_x) + v_x = self.v_bn4_2(v_x) + visual_feature_3 = self.relu(v_x) + else: + visual_feature_3 = None + if self.use_seq: + x = self.maxpool3(x_1) + x = self.layer3(x) + x = self.conv3(x) + x = self.bn3(x) + x_2 = self.relu(x) + + x = self.layer4(x_2) + x = self.conv4_1(x) + x = self.bn4_1(x) + x = self.relu(x) + x = self.conv4_2(x) + x = self.bn4_2(x) + x_3 = self.relu(x) + else: + x_3 = None + + return [visual_feature_3, x_3] + + +class ResNetBase(nn.Layer): + def __init__(self, in_channels, out_channels, block, layers): + super(ResNetBase, self).__init__() + + self.out_channels_block = [ + int(out_channels / 4), int(out_channels / 2), out_channels, + out_channels + ] + + self.inplanes = int(out_channels / 8) + self.conv0_1 = nn.Conv2D( + in_channels, + int(out_channels / 16), + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn0_1 = nn.BatchNorm(int(out_channels / 16)) + self.conv0_2 = nn.Conv2D( + int(out_channels / 16), + self.inplanes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn0_2 = nn.BatchNorm(self.inplanes) + self.relu = nn.ReLU() + + self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer1 = self._make_layer(block, self.out_channels_block[0], + layers[0]) + self.conv1 = nn.Conv2D( + self.out_channels_block[0], + self.out_channels_block[0], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm(self.out_channels_block[0]) + + self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + self.layer2 = self._make_layer( + block, self.out_channels_block[1], layers[1], stride=1) + self.conv2 = nn.Conv2D( + self.out_channels_block[1], + self.out_channels_block[1], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm(self.out_channels_block[1]) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm(planes * block.expansion), ) + + layers = list() + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv0_1(x) + x = 
self.bn0_1(x) + x = self.relu(x) + x = self.conv0_2(x) + x = self.bn0_2(x) + x = self.relu(x) + + x = self.maxpool1(x) + x = self.layer1(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.maxpool2(x) + x = self.layer2(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + + return x + + +class RFLBase(nn.Layer): + """ Reciprocal feature learning share backbone network""" + + def __init__(self, in_channels, out_channels=512): + super(RFLBase, self).__init__() + self.ConvNet = ResNetBase(in_channels, out_channels, BasicBlock, + [1, 2, 5, 3]) + + def forward(self, inputs): + return self.ConvNet(inputs) diff --git a/ppocr/modeling/backbones/rec_resnet_vd.py b/ppocr/modeling/backbones/rec_resnet_vd.py new file mode 100644 index 0000000000000000000000000000000000000000..0187deb96f111a2c2b545c7be42dba48c7352e17 --- /dev/null +++ b/ppocr/modeling/backbones/rec_resnet_vd.py @@ -0,0 +1,286 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["ResNet"] + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=stride, stride=stride, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1 if is_vd_mode else stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = 
ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + stride, + shortcut=True, + if_first=False, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + is_vd_mode=not if_first and stride[0] != 1, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, in_channels=3, layers=50, **kwargs): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + self.block_list.append(bottleneck_block) + self.out_channels = num_filters[block] * 4 + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = 
(2, 1) + else: + stride = (1, 1) + + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + in_channels=num_channels[block] + if i == 0 else num_filters[block], + out_channels=num_filters[block], + stride=stride, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + shortcut = True + self.block_list.append(basic_block) + self.out_channels = num_filters[block] + self.out_pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.out_pool(y) + return y diff --git a/ppocr/modeling/backbones/rec_svtrnet.py b/ppocr/modeling/backbones/rec_svtrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..de5920c633b27b8284793072315d8105de638017 --- /dev/null +++ b/ppocr/modeling/backbones/rec_svtrnet.py @@ -0,0 +1,592 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2D( + dim, + dim, + local_k, + 1, [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + def forward(self, x): + h = self.HW[0] + w = self.HW[1] + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=None, + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == 'Local' and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype='float32') + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. 
+ mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32') + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask.unsqueeze([0, 1]) + self.mixer = mixer + + def forward(self, x): + if self.HW is not None: + N = self.N + C = self.C + else: + _, N, C = x.shape + qkv = self.qkv(x).reshape((0, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) + if self.mixer == 'Local': + attn += self.mask + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=[7, 11], + HW=None, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2, + patch_size=[4, 4], + mode='pope'): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.norm = None + if mode == 'pope': + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + 
kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + elif mode == 'linear': + self.proj = nn.Conv2D( + 1, embed_dim, kernel_size=patch_size, stride=patch_size) + self.num_patches = img_size[0] // patch_size[0] * img_size[ + 1] // patch_size[1] + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class SubSample(nn.Layer): + def __init__(self, + in_channels, + out_channels, + types='Pool', + stride=[2, 1], + sub_norm='nn.LayerNorm', + act=None): + super().__init__() + self.types = types + if types == 'Pool': + self.avgpool = nn.AvgPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.maxpool = nn.MaxPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal())) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + + if self.types == 'Pool': + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).transpose((0, 2, 1))) + else: + x = self.conv(x) + out = x.flatten(2).transpose((0, 2, 1)) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Layer): + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Local'] * 6 + ['Global'] * + 6, # Local atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging='Conv', # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + last_drop=0.1, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + sub_norm='nn.LayerNorm', + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit='Block', + act='nn.GELU', + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = self.create_parameter( + shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.LayerList([ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0:depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ]) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + 
sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.LayerList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.LayerList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num]) + self.last_conv = nn.Conv2D( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop, mode="downscale_in_infer") + + trunc_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[0], self.HW[0], self.HW[1]])) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]])) + for blk in self.blocks3: + x = blk(x) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.use_lenhead: + len_x = self.len_conv(x.mean(1)) + len_x = self.dropout_len(self.hardswish_len(len_x)) + if self.last_stage: + if self.patch_merging is not None: + h = self.HW[0] // 4 + else: + h = self.HW[0] + x = self.avg_pool( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[2], h, self.HW[1]])) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + if self.use_lenhead: + return x, len_x + return x diff --git a/ppocr/modeling/backbones/rec_vitstr.py b/ppocr/modeling/backbones/rec_vitstr.py new file mode 100644 index 0000000000000000000000000000000000000000..d5d7d5148a1120e6f97a321b4135c6780c0c5db2 --- /dev/null +++ 
b/ppocr/modeling/backbones/rec_vitstr.py @@ -0,0 +1,120 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py +""" + +import numpy as np +import paddle +import paddle.nn as nn +from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_ + +scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]} + + +class ViTSTR(nn.Layer): + def __init__(self, + img_size=[224, 224], + in_channels=1, + scale='tiny', + seqlen=27, + patch_size=[16, 16], + embed_dim=None, + depth=12, + num_heads=None, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_path_rate=0., + drop_rate=0., + attn_drop_rate=0., + norm_layer='nn.LayerNorm', + act_layer='nn.GELU', + epsilon=1e-6, + out_channels=None, + **kwargs): + super().__init__() + self.seqlen = seqlen + embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[ + scale][0] + num_heads = num_heads if num_heads is not None else scale_dim_heads[ + scale][1] + out_channels = out_channels if out_channels is not None else embed_dim + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim, + patch_size=patch_size, + mode='linear') + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=[1, num_patches + 1, embed_dim], default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dim], default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=eval(act_layer), + epsilon=epsilon, + prenorm=False) for i in range(depth) + ]) + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + self.out_channels = out_channels + + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1]) + x = paddle.concat((cls_tokens, x), axis=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x[:, :self.seqlen] + return x.transpose([0, 2, 1]).unsqueeze(2) diff --git 
a/ppocr/modeling/backbones/table_master_resnet.py b/ppocr/modeling/backbones/table_master_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..dacf5ed26e5374b3c93c1a983be1d7b5b4c471fc --- /dev/null +++ b/ppocr/modeling/backbones/table_master_resnet.py @@ -0,0 +1,369 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/mmocr/models/textrecog/backbones/table_resnet_extra.py +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class BasicBlock(nn.Layer): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + gcb_config=None): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(planes, momentum=0.9) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, planes, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(planes, momentum=0.9) + self.downsample = downsample + self.stride = stride + self.gcb_config = gcb_config + + if self.gcb_config is not None: + gcb_ratio = gcb_config['ratio'] + gcb_headers = gcb_config['headers'] + att_scale = gcb_config['att_scale'] + fusion_type = gcb_config['fusion_type'] + self.context_block = MultiAspectGCAttention( + inplanes=planes, + ratio=gcb_ratio, + headers=gcb_headers, + att_scale=att_scale, + fusion_type=fusion_type) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.gcb_config is not None: + out = self.context_block(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +def get_gcb_config(gcb_config, layer): + if gcb_config is None or not gcb_config['layers'][layer]: + return None + else: + return gcb_config + + +class TableResNetExtra(nn.Layer): + def __init__(self, layers, in_channels=3, gcb_config=None): + assert len(layers) >= 4 + + super(TableResNetExtra, self).__init__() + self.inplanes = 128 + self.conv1 = nn.Conv2D( + in_channels, + 64, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.bn1 = nn.BatchNorm2D(64) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D( + 64, 128, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(128) + self.relu2 = nn.ReLU() + + self.maxpool1 = nn.MaxPool2D(kernel_size=2, stride=2) + + self.layer1 = self._make_layer( + BasicBlock, + 256, + layers[0], + stride=1, + gcb_config=get_gcb_config(gcb_config, 0)) + + self.conv3 = nn.Conv2D( + 256, 256, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(256) + self.relu3 = nn.ReLU() + + self.maxpool2 = nn.MaxPool2D(kernel_size=2, stride=2) + + self.layer2 = self._make_layer( + BasicBlock, + 256, + 
layers[1], + stride=1, + gcb_config=get_gcb_config(gcb_config, 1)) + + self.conv4 = nn.Conv2D( + 256, 256, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn4 = nn.BatchNorm2D(256) + self.relu4 = nn.ReLU() + + self.maxpool3 = nn.MaxPool2D(kernel_size=2, stride=2) + + self.layer3 = self._make_layer( + BasicBlock, + 512, + layers[2], + stride=1, + gcb_config=get_gcb_config(gcb_config, 2)) + + self.conv5 = nn.Conv2D( + 512, 512, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn5 = nn.BatchNorm2D(512) + self.relu5 = nn.ReLU() + + self.layer4 = self._make_layer( + BasicBlock, + 512, + layers[3], + stride=1, + gcb_config=get_gcb_config(gcb_config, 3)) + + self.conv6 = nn.Conv2D( + 512, 512, kernel_size=3, stride=1, padding=1, bias_attr=False) + self.bn6 = nn.BatchNorm2D(512) + self.relu6 = nn.ReLU() + + self.out_channels = [256, 256, 512] + + def _make_layer(self, block, planes, blocks, stride=1, gcb_config=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2D( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias_attr=False), + nn.BatchNorm2D(planes * block.expansion), ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + gcb_config=gcb_config)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + f = [] + x = self.conv1(x) + + x = self.bn1(x) + x = self.relu1(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + + x = self.maxpool1(x) + x = self.layer1(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.relu3(x) + f.append(x) + + x = self.maxpool2(x) + x = self.layer2(x) + + x = self.conv4(x) + x = self.bn4(x) + x = self.relu4(x) + f.append(x) + + x = self.maxpool3(x) + + x = self.layer3(x) + x = self.conv5(x) + x = self.bn5(x) + x = self.relu5(x) + + x = self.layer4(x) + x = self.conv6(x) + x = self.bn6(x) + x = self.relu6(x) + f.append(x) + return f + + +class MultiAspectGCAttention(nn.Layer): + def __init__(self, + inplanes, + ratio, + headers, + pooling_type='att', + att_scale=False, + fusion_type='channel_add'): + super(MultiAspectGCAttention, self).__init__() + assert pooling_type in ['avg', 'att'] + + assert fusion_type in ['channel_add', 'channel_mul', 'channel_concat'] + assert inplanes % headers == 0 and inplanes >= 8 # inplanes must be divided by headers evenly + + self.headers = headers + self.inplanes = inplanes + self.ratio = ratio + self.planes = int(inplanes * ratio) + self.pooling_type = pooling_type + self.fusion_type = fusion_type + self.att_scale = False + + self.single_header_inplanes = int(inplanes / headers) + + if pooling_type == 'att': + self.conv_mask = nn.Conv2D( + self.single_header_inplanes, 1, kernel_size=1) + self.softmax = nn.Softmax(axis=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + if fusion_type == 'channel_add': + self.channel_add_conv = nn.Sequential( + nn.Conv2D( + self.inplanes, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(), + nn.Conv2D( + self.planes, self.inplanes, kernel_size=1)) + elif fusion_type == 'channel_concat': + self.channel_concat_conv = nn.Sequential( + nn.Conv2D( + self.inplanes, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(), + nn.Conv2D( + self.planes, self.inplanes, kernel_size=1)) + # for concat + self.cat_conv = nn.Conv2D( + 2 * self.inplanes, 
self.inplanes, kernel_size=1) + elif fusion_type == 'channel_mul': + self.channel_mul_conv = nn.Sequential( + nn.Conv2D( + self.inplanes, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(), + nn.Conv2D( + self.planes, self.inplanes, kernel_size=1)) + + def spatial_pool(self, x): + batch, channel, height, width = x.shape + if self.pooling_type == 'att': + # [N*headers, C', H , W] C = headers * C' + x = x.reshape([ + batch * self.headers, self.single_header_inplanes, height, width + ]) + input_x = x + + # [N*headers, C', H * W] C = headers * C' + # input_x = input_x.view(batch, channel, height * width) + input_x = input_x.reshape([ + batch * self.headers, self.single_header_inplanes, + height * width + ]) + + # [N*headers, 1, C', H * W] + input_x = input_x.unsqueeze(1) + # [N*headers, 1, H, W] + context_mask = self.conv_mask(x) + # [N*headers, 1, H * W] + context_mask = context_mask.reshape( + [batch * self.headers, 1, height * width]) + + # scale variance + if self.att_scale and self.headers > 1: + context_mask = context_mask / paddle.sqrt( + self.single_header_inplanes) + + # [N*headers, 1, H * W] + context_mask = self.softmax(context_mask) + + # [N*headers, 1, H * W, 1] + context_mask = context_mask.unsqueeze(-1) + # [N*headers, 1, C', 1] = [N*headers, 1, C', H * W] * [N*headers, 1, H * W, 1] + context = paddle.matmul(input_x, context_mask) + + # [N, headers * C', 1, 1] + context = context.reshape( + [batch, self.headers * self.single_header_inplanes, 1, 1]) + else: + # [N, C, 1, 1] + context = self.avg_pool(x) + + return context + + def forward(self, x): + # [N, C, 1, 1] + context = self.spatial_pool(x) + + out = x + + if self.fusion_type == 'channel_mul': + # [N, C, 1, 1] + channel_mul_term = F.sigmoid(self.channel_mul_conv(context)) + out = out * channel_mul_term + elif self.fusion_type == 'channel_add': + # [N, C, 1, 1] + channel_add_term = self.channel_add_conv(context) + out = out + channel_add_term + else: + # [N, C, 1, 1] + channel_concat_term = self.channel_concat_conv(context) + + # use concat + _, C1, _, _ = channel_concat_term.shape + N, C2, H, W = out.shape + + out = paddle.concat( + [out, channel_concat_term.expand([-1, -1, H, W])], axis=1) + out = self.cat_conv(out) + out = F.layer_norm(out, [self.inplanes, H, W]) + out = F.relu(out) + + return out diff --git a/ppocr/modeling/backbones/vqa_layoutlm.py b/ppocr/modeling/backbones/vqa_layoutlm.py new file mode 100644 index 0000000000000000000000000000000000000000..acb1315cc0a588396549e5b8928bd2e4d3c769be --- /dev/null +++ b/ppocr/modeling/backbones/vqa_layoutlm.py @@ -0,0 +1,238 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
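As a quick sanity check of the multi-aspect global-context block added above in table_master_resnet.py, here is a minimal sketch; the channel count, header count and ratio are illustrative assumptions, not values taken from a shipped config:

import paddle
from ppocr.modeling.backbones.table_master_resnet import MultiAspectGCAttention

# 256 channels split over 8 headers; ratio 0.0625 gives a 16-channel bottleneck.
gcb = MultiAspectGCAttention(inplanes=256, ratio=0.0625, headers=8,
                             att_scale=False, fusion_type='channel_add')
feat = paddle.rand([2, 256, 12, 40])    # [N, C, H, W] backbone feature
out = gcb(feat)
print(out.shape)                        # [2, 256, 12, 40]: pooled context added back channel-wise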
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from paddle import nn + +from paddlenlp.transformers import LayoutXLMModel, LayoutXLMForTokenClassification, LayoutXLMForRelationExtraction +from paddlenlp.transformers import LayoutLMModel, LayoutLMForTokenClassification +from paddlenlp.transformers import LayoutLMv2Model, LayoutLMv2ForTokenClassification, LayoutLMv2ForRelationExtraction +from paddlenlp.transformers import AutoModel + +__all__ = ["LayoutXLMForSer", "LayoutLMForSer"] + +pretrained_model_dict = { + LayoutXLMModel: { + "base": "layoutxlm-base-uncased", + "vi": "vi-layoutxlm-base-uncased", + }, + LayoutLMModel: { + "base": "layoutlm-base-uncased", + }, + LayoutLMv2Model: { + "base": "layoutlmv2-base-uncased", + "vi": "vi-layoutlmv2-base-uncased", + }, +} + + +class NLPBaseModel(nn.Layer): + def __init__(self, + base_model_class, + model_class, + mode="base", + type="ser", + pretrained=True, + checkpoints=None, + **kwargs): + super(NLPBaseModel, self).__init__() + if checkpoints is not None: # load the trained model + self.model = model_class.from_pretrained(checkpoints) + else: # load the pretrained-model + pretrained_model_name = pretrained_model_dict[base_model_class][ + mode] + if pretrained is True: + base_model = base_model_class.from_pretrained( + pretrained_model_name) + else: + base_model = base_model_class.from_pretrained(pretrained) + if type == "ser": + self.model = model_class( + base_model, num_classes=kwargs["num_classes"], dropout=None) + else: + self.model = model_class(base_model, dropout=None) + self.out_channels = 1 + self.use_visual_backbone = True + + +class LayoutLMForSer(NLPBaseModel): + def __init__(self, + num_classes, + pretrained=True, + checkpoints=None, + mode="base", + **kwargs): + super(LayoutLMForSer, self).__init__( + LayoutLMModel, + LayoutLMForTokenClassification, + mode, + "ser", + pretrained, + checkpoints, + num_classes=num_classes, ) + self.use_visual_backbone = False + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[1], + attention_mask=x[2], + token_type_ids=x[3], + position_ids=None, + output_hidden_states=False) + return x + + +class LayoutLMv2ForSer(NLPBaseModel): + def __init__(self, + num_classes, + pretrained=True, + checkpoints=None, + mode="base", + **kwargs): + super(LayoutLMv2ForSer, self).__init__( + LayoutLMv2Model, + LayoutLMv2ForTokenClassification, + mode, + "ser", + pretrained, + checkpoints, + num_classes=num_classes) + if hasattr(self.model.layoutlmv2, "use_visual_backbone" + ) and self.model.layoutlmv2.use_visual_backbone is False: + self.use_visual_backbone = False + + def forward(self, x): + if self.use_visual_backbone is True: + image = x[4] + else: + image = None + x = self.model( + input_ids=x[0], + bbox=x[1], + attention_mask=x[2], + token_type_ids=x[3], + image=image, + position_ids=None, + head_mask=None, + labels=None) + if self.training: + res = {"backbone_out": x[0]} + res.update(x[1]) + return res + else: + return x + + +class LayoutXLMForSer(NLPBaseModel): + def __init__(self, + num_classes, + pretrained=True, + checkpoints=None, + mode="base", + **kwargs): + super(LayoutXLMForSer, self).__init__( + LayoutXLMModel, + LayoutXLMForTokenClassification, + mode, + "ser", + pretrained, + checkpoints, + num_classes=num_classes) + if hasattr(self.model.layoutxlm, "use_visual_backbone" + ) and self.model.layoutxlm.use_visual_backbone is False: + self.use_visual_backbone = False + + def forward(self, x): + if 
self.use_visual_backbone is True: + image = x[4] + else: + image = None + x = self.model( + input_ids=x[0], + bbox=x[1], + attention_mask=x[2], + token_type_ids=x[3], + image=image, + position_ids=None, + head_mask=None, + labels=None) + if self.training: + res = {"backbone_out": x[0]} + res.update(x[1]) + return res + else: + return x + + +class LayoutLMv2ForRe(NLPBaseModel): + def __init__(self, pretrained=True, checkpoints=None, mode="base", + **kwargs): + super(LayoutLMv2ForRe, self).__init__( + LayoutLMv2Model, LayoutLMv2ForRelationExtraction, mode, "re", + pretrained, checkpoints) + if hasattr(self.model.layoutlmv2, "use_visual_backbone" + ) and self.model.layoutlmv2.use_visual_backbone is False: + self.use_visual_backbone = False + + def forward(self, x): + x = self.model( + input_ids=x[0], + bbox=x[1], + attention_mask=x[2], + token_type_ids=x[3], + image=x[4], + position_ids=None, + head_mask=None, + labels=None, + entities=x[5], + relations=x[6]) + return x + + +class LayoutXLMForRe(NLPBaseModel): + def __init__(self, pretrained=True, checkpoints=None, mode="base", + **kwargs): + super(LayoutXLMForRe, self).__init__( + LayoutXLMModel, LayoutXLMForRelationExtraction, mode, "re", + pretrained, checkpoints) + if hasattr(self.model.layoutxlm, "use_visual_backbone" + ) and self.model.layoutxlm.use_visual_backbone is False: + self.use_visual_backbone = False + + def forward(self, x): + if self.use_visual_backbone is True: + image = x[4] + entities = x[5] + relations = x[6] + else: + image = None + entities = x[4] + relations = x[5] + x = self.model( + input_ids=x[0], + bbox=x[1], + attention_mask=x[2], + token_type_ids=x[3], + image=image, + position_ids=None, + head_mask=None, + labels=None, + entities=entities, + relations=relations) + return x diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..65afaf84f4453f2d4199371576ac71bb93a1e6d5 --- /dev/null +++ b/ppocr/modeling/heads/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
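The SER/RE wrappers above unpack their inputs purely by position, so the batch must be assembled in a fixed order. A minimal sketch of that packing follows; the shapes are illustrative assumptions, the real tensors come from the VQA data pipeline:

import paddle

seq_len = 512
input_ids      = paddle.zeros([1, seq_len], dtype='int64')
bbox           = paddle.zeros([1, seq_len, 4], dtype='int64')
attention_mask = paddle.ones([1, seq_len], dtype='int64')
token_type_ids = paddle.zeros([1, seq_len], dtype='int64')
image          = paddle.rand([1, 3, 224, 224])

# SER: x[0]..x[4]; the image is only consumed when use_visual_backbone is True.
ser_batch = [input_ids, bbox, attention_mask, token_type_ids, image]
# RE: entities and relations follow the image as x[5] and x[6]; when the backbone
# has no visual branch they shift down to x[4] and x[5], as LayoutXLMForRe.forward shows.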
+ +__all__ = ['build_head'] + + +def build_head(config): + # det head + from .det_db_head import DBHead + from .det_east_head import EASTHead + from .det_sast_head import SASTHead + from .det_pse_head import PSEHead + from .det_fce_head import FCEHead + from .e2e_pg_head import PGHead + from .det_ct_head import CT_Head + + # rec head + from .rec_ctc_head import CTCHead + from .rec_att_head import AttentionHead + from .rec_srn_head import SRNHead + from .rec_nrtr_head import Transformer + from .rec_sar_head import SARHead + from .rec_aster_head import AsterHead + from .rec_pren_head import PRENHead + from .rec_multi_head import MultiHead + from .rec_spin_att_head import SPINAttentionHead + from .rec_abinet_head import ABINetHead + from .rec_robustscanner_head import RobustScannerHead + from .rec_visionlan_head import VLHead + from .rec_rfl_head import RFLHead + from .rec_can_head import CANHead + + # cls head + from .cls_head import ClsHead + + #kie head + from .kie_sdmgr_head import SDMGRHead + + from .table_att_head import TableAttentionHead, SLAHead + from .table_master_head import TableMasterHead + + support_dict = [ + 'DBHead', 'PSEHead', 'FCEHead', 'EASTHead', 'SASTHead', 'CTCHead', + 'ClsHead', 'AttentionHead', 'SRNHead', 'PGHead', 'Transformer', + 'TableAttentionHead', 'SARHead', 'AsterHead', 'SDMGRHead', 'PRENHead', + 'MultiHead', 'ABINetHead', 'TableMasterHead', 'SPINAttentionHead', + 'VLHead', 'SLAHead', 'RobustScannerHead', 'CT_Head', 'RFLHead', + 'DRRGHead', 'CANHead' + ] + + if config['name'] == 'DRRGHead': + from .det_drrg_head import DRRGHead + support_dict.append('DRRGHead') + + #table head + + module_name = config.pop('name') + assert module_name in support_dict, Exception('head only support {}'.format( + support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/modeling/heads/cls_head.py b/ppocr/modeling/heads/cls_head.py new file mode 100644 index 0000000000000000000000000000000000000000..91bfa615a8206b5ec0f993429ccae990a05d0b9b --- /dev/null +++ b/ppocr/modeling/heads/cls_head.py @@ -0,0 +1,52 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
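build_head above dispatches on the 'name' key and forwards the remaining keys to the head's constructor, so a head can be built straight from a config dict. A minimal usage sketch; the channel count is an illustrative assumption and must match the neck's output:

from ppocr.modeling.heads import build_head

db_head = build_head({'name': 'DBHead', 'in_channels': 256, 'k': 50})
print(type(db_head).__name__)   # DBHead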
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +import paddle.nn.functional as F + + +class ClsHead(nn.Layer): + """ + Class orientation + + Args: + + params(dict): super parameters for build Class network + """ + + def __init__(self, in_channels, class_dim, **kwargs): + super(ClsHead, self).__init__() + self.pool = nn.AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(in_channels * 1.0) + self.fc = nn.Linear( + in_channels, + class_dim, + weight_attr=ParamAttr( + name="fc_0.w_0", + initializer=nn.initializer.Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_0.b_0"), ) + + def forward(self, x, targets=None): + x = self.pool(x) + x = paddle.reshape(x, shape=[x.shape[0], x.shape[1]]) + x = self.fc(x) + if not self.training: + x = F.softmax(x, axis=1) + return x diff --git a/ppocr/modeling/heads/det_ct_head.py b/ppocr/modeling/heads/det_ct_head.py new file mode 100644 index 0000000000000000000000000000000000000000..08e6719e8f0ade6887eb4ad7f44a2bc36ec132db --- /dev/null +++ b/ppocr/modeling/heads/det_ct_head.py @@ -0,0 +1,69 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + +import math +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) + + +class CT_Head(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + num_classes, + loss_kernel=None, + loss_loc=None): + super(CT_Head, self).__init__() + self.conv1 = nn.Conv2D( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D( + hidden_dim, num_classes, kernel_size=1, stride=1, padding=0) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. / n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + zeros_(m.bias) + ones_(m.weight) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def forward(self, f, targets=None): + out = self.conv1(f) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + + if self.training: + out = self._upsample(out, scale=4) + return {'maps': out} + else: + score = F.sigmoid(out[:, 0, :, :]) + return {'maps': out, 'score': score} diff --git a/ppocr/modeling/heads/det_db_head.py b/ppocr/modeling/heads/det_db_head.py new file mode 100644 index 0000000000000000000000000000000000000000..77cb6f1db2dda92dbe74803af68c6ec87ed2d583 --- /dev/null +++ b/ppocr/modeling/heads/det_db_head.py @@ -0,0 +1,110 @@ +# copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +def get_bias_attr(k): + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = paddle.nn.initializer.Uniform(-stdv, stdv) + bias_attr = ParamAttr(initializer=initializer) + return bias_attr + + +class Head(nn.Layer): + def __init__(self, in_channels, kernel_list=[3, 2, 2], **kwargs): + super(Head, self).__init__() + + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // 4, + kernel_size=kernel_list[0], + padding=int(kernel_list[0] // 2), + weight_attr=ParamAttr(), + bias_attr=False) + self.conv_bn1 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1e-4)), + act='relu') + self.conv2 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=in_channels // 4, + kernel_size=kernel_list[1], + stride=2, + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4)) + self.conv_bn2 = nn.BatchNorm( + num_channels=in_channels // 4, + param_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)), + bias_attr=ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1e-4)), + act="relu") + self.conv3 = nn.Conv2DTranspose( + in_channels=in_channels // 4, + out_channels=1, + kernel_size=kernel_list[2], + stride=2, + weight_attr=ParamAttr( + initializer=paddle.nn.initializer.KaimingUniform()), + bias_attr=get_bias_attr(in_channels // 4), ) + + def forward(self, x): + x = self.conv1(x) + x = self.conv_bn1(x) + x = self.conv2(x) + x = self.conv_bn2(x) + x = self.conv3(x) + x = F.sigmoid(x) + return x + + +class DBHead(nn.Layer): + """ + Differentiable Binarization (DB) for text detection: + see https://arxiv.org/abs/1911.08947 + args: + params(dict): super parameters for build DB network + """ + + def __init__(self, in_channels, k=50, **kwargs): + super(DBHead, self).__init__() + self.k = k + self.binarize = Head(in_channels, **kwargs) + self.thresh = Head(in_channels, **kwargs) + + def step_function(self, x, y): + return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y))) + + def forward(self, x, targets=None): + shrink_maps = self.binarize(x) + if not self.training: + return {'maps': shrink_maps} + + threshold_maps = self.thresh(x) + binary_maps = self.step_function(shrink_maps, threshold_maps) + y = paddle.concat([shrink_maps, threshold_maps, binary_maps], axis=1) + return {'maps': y} diff --git a/ppocr/modeling/heads/det_drrg_head.py b/ppocr/modeling/heads/det_drrg_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3aee1f8cb7734fd6093cd6ed11e5492ef5cd9785 --- /dev/null +++ b/ppocr/modeling/heads/det_drrg_head.py @@ 
-0,0 +1,191 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/drrg_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import warnings +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from .gcn import GCN +from .local_graph import LocalGraphs +from .proposal_local_graph import ProposalLocalGraphs + + +class DRRGHead(nn.Layer): + def __init__(self, + in_channels, + k_at_hops=(8, 4), + num_adjacent_linkages=3, + node_geo_feat_len=120, + pooling_scale=1.0, + pooling_output_size=(4, 3), + nms_thr=0.3, + min_width=8.0, + max_width=24.0, + comp_shrink_ratio=1.03, + comp_ratio=0.4, + comp_score_thr=0.3, + text_region_thr=0.2, + center_region_thr=0.2, + center_region_area_thr=50, + local_graph_thr=0.7, + **kwargs): + super().__init__() + + assert isinstance(in_channels, int) + assert isinstance(k_at_hops, tuple) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert isinstance(pooling_output_size, tuple) + assert isinstance(comp_shrink_ratio, float) + assert isinstance(nms_thr, float) + assert isinstance(min_width, float) + assert isinstance(max_width, float) + assert isinstance(comp_ratio, float) + assert isinstance(comp_score_thr, float) + assert isinstance(text_region_thr, float) + assert isinstance(center_region_thr, float) + assert isinstance(center_region_area_thr, int) + assert isinstance(local_graph_thr, float) + + self.in_channels = in_channels + self.out_channels = 6 + self.downsample_ratio = 1.0 + self.k_at_hops = k_at_hops + self.num_adjacent_linkages = num_adjacent_linkages + self.node_geo_feat_len = node_geo_feat_len + self.pooling_scale = pooling_scale + self.pooling_output_size = pooling_output_size + self.comp_shrink_ratio = comp_shrink_ratio + self.nms_thr = nms_thr + self.min_width = min_width + self.max_width = max_width + self.comp_ratio = comp_ratio + self.comp_score_thr = comp_score_thr + self.text_region_thr = text_region_thr + self.center_region_thr = center_region_thr + self.center_region_area_thr = center_region_area_thr + self.local_graph_thr = local_graph_thr + + self.out_conv = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0) + + self.graph_train = LocalGraphs( + self.k_at_hops, self.num_adjacent_linkages, self.node_geo_feat_len, + self.pooling_scale, self.pooling_output_size, self.local_graph_thr) + + self.graph_test = ProposalLocalGraphs( + self.k_at_hops, self.num_adjacent_linkages, self.node_geo_feat_len, + self.pooling_scale, self.pooling_output_size, self.nms_thr, + self.min_width, self.max_width, self.comp_shrink_ratio, + self.comp_ratio, self.comp_score_thr, self.text_region_thr, + 
self.center_region_thr, self.center_region_area_thr) + + pool_w, pool_h = self.pooling_output_size + node_feat_len = (pool_w * pool_h) * ( + self.in_channels + self.out_channels) + self.node_geo_feat_len + self.gcn = GCN(node_feat_len) + + def forward(self, inputs, targets=None): + """ + Args: + inputs (Tensor): Shape of :math:`(N, C, H, W)`. + gt_comp_attribs (list[ndarray]): The padded text component + attributes. Shape: (num_component, 8). + + Returns: + tuple: Returns (pred_maps, (gcn_pred, gt_labels)). + + - | pred_maps (Tensor): Prediction map with shape + :math:`(N, C_{out}, H, W)`. + - | gcn_pred (Tensor): Prediction from GCN module, with + shape :math:`(N, 2)`. + - | gt_labels (Tensor): Ground-truth label with shape + :math:`(N, 8)`. + """ + if self.training: + assert targets is not None + gt_comp_attribs = targets[7] + pred_maps = self.out_conv(inputs) + feat_maps = paddle.concat([inputs, pred_maps], axis=1) + node_feats, adjacent_matrices, knn_inds, gt_labels = self.graph_train( + feat_maps, np.stack(gt_comp_attribs)) + + gcn_pred = self.gcn(node_feats, adjacent_matrices, knn_inds) + + return pred_maps, (gcn_pred, gt_labels) + else: + return self.single_test(inputs) + + def single_test(self, feat_maps): + r""" + Args: + feat_maps (Tensor): Shape of :math:`(N, C, H, W)`. + + Returns: + tuple: Returns (edge, score, text_comps). + + - | edge (ndarray): The edge array of shape :math:`(N, 2)` + where each row is a pair of text component indices + that makes up an edge in graph. + - | score (ndarray): The score array of shape :math:`(N,)`, + corresponding to the edge above. + - | text_comps (ndarray): The text components of shape + :math:`(N, 9)` where each row corresponds to one box and + its score: (x1, y1, x2, y2, x3, y3, x4, y4, score). + """ + pred_maps = self.out_conv(feat_maps) + feat_maps = paddle.concat([feat_maps, pred_maps], axis=1) + + none_flag, graph_data = self.graph_test(pred_maps, feat_maps) + + (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivot_local_graphs, text_comps) = graph_data + + if none_flag: + return None, None, None + gcn_pred = self.gcn(local_graphs_node_feat, adjacent_matrices, + pivots_knn_inds) + pred_labels = F.softmax(gcn_pred, axis=1) + + edges = [] + scores = [] + pivot_local_graphs = pivot_local_graphs.squeeze().numpy() + + for pivot_ind, pivot_local_graph in enumerate(pivot_local_graphs): + pivot = pivot_local_graph[0] + for k_ind, neighbor_ind in enumerate(pivots_knn_inds[pivot_ind]): + neighbor = pivot_local_graph[neighbor_ind.item()] + edges.append([pivot, neighbor]) + scores.append(pred_labels[pivot_ind * pivots_knn_inds.shape[1] + + k_ind, 1].item()) + + edges = np.asarray(edges) + scores = np.asarray(scores) + + return edges, scores, text_comps diff --git a/ppocr/modeling/heads/det_east_head.py b/ppocr/modeling/heads/det_east_head.py new file mode 100644 index 0000000000000000000000000000000000000000..004eb5d7bb9a134d1a84f980e37e5336dc43a29a --- /dev/null +++ b/ppocr/modeling/heads/det_east_head.py @@ -0,0 +1,121 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class EASTHead(nn.Layer): + """ + """ + def __init__(self, in_channels, model_name, **kwargs): + super(EASTHead, self).__init__() + self.model_name = model_name + if self.model_name == "large": + num_outputs = [128, 64, 1, 8] + else: + num_outputs = [64, 32, 1, 8] + + self.det_conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=num_outputs[0], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head1") + self.det_conv2 = ConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="det_head2") + self.score_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_score") + self.geo_conv = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + name="f_geo") + + def forward(self, x, targets=None): + f_det = self.det_conv1(x) + f_det = self.det_conv2(f_det) + f_score = self.score_conv(f_det) + f_score = F.sigmoid(f_score) + f_geo = self.geo_conv(f_det) + f_geo = (F.sigmoid(f_geo) - 0.5) * 2 * 800 + + pred = {'f_score': f_score, 'f_geo': f_geo} + return pred diff --git a/ppocr/modeling/heads/det_fce_head.py b/ppocr/modeling/heads/det_fce_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9503989f58f09a5137f1002f7b90d0942a97d1d6 --- /dev/null +++ b/ppocr/modeling/heads/det_fce_head.py @@ -0,0 +1,99 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
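The EAST head above keeps the score map in (0, 1) via a sigmoid and rescales the geometry map to roughly (-800, 800) pixels with (sigmoid(x) - 0.5) * 2 * 800. A small smoke test of those output shapes; the input channel count is an assumption that must match the FPN output:

import paddle
from ppocr.modeling.heads.det_east_head import EASTHead

head = EASTHead(in_channels=128, model_name='large')
feat = paddle.rand([1, 128, 160, 160])
pred = head(feat)
print(pred['f_score'].shape)   # [1, 1, 160, 160], values in (0, 1)
print(pred['f_geo'].shape)     # [1, 8, 160, 160], values scaled into (-800, 800)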
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/dense_heads/fce_head.py +""" + +from paddle import nn +from paddle import ParamAttr +import paddle.nn.functional as F +from paddle.nn.initializer import Normal +import paddle +from functools import partial + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +class FCEHead(nn.Layer): + """The class for implementing FCENet head. + FCENet(CVPR2021): Fourier Contour Embedding for Arbitrary-shaped Text + Detection. + + [https://arxiv.org/abs/2104.10442] + + Args: + in_channels (int): The number of input channels. + scales (list[int]) : The scale of each layer. + fourier_degree (int) : The maximum Fourier transform degree k. + """ + + def __init__(self, in_channels, fourier_degree=5): + super().__init__() + assert isinstance(in_channels, int) + + self.downsample_ratio = 1.0 + self.in_channels = in_channels + self.fourier_degree = fourier_degree + self.out_channels_cls = 4 + self.out_channels_reg = (2 * self.fourier_degree + 1) * 2 + + self.out_conv_cls = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels_cls, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr( + name='cls_weights', + initializer=Normal( + mean=0., std=0.01)), + bias_attr=True) + self.out_conv_reg = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels_reg, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr( + name='reg_weights', + initializer=Normal( + mean=0., std=0.01)), + bias_attr=True) + + def forward(self, feats, targets=None): + cls_res, reg_res = multi_apply(self.forward_single, feats) + level_num = len(cls_res) + outs = {} + if not self.training: + for i in range(level_num): + tr_pred = F.softmax(cls_res[i][:, 0:2, :, :], axis=1) + tcl_pred = F.softmax(cls_res[i][:, 2:, :, :], axis=1) + outs['level_{}'.format(i)] = paddle.concat( + [tr_pred, tcl_pred, reg_res[i]], axis=1) + else: + preds = [[cls_res[i], reg_res[i]] for i in range(level_num)] + outs['levels'] = preds + return outs + + def forward_single(self, x): + cls_predict = self.out_conv_cls(x) + reg_predict = self.out_conv_reg(x) + return cls_predict, reg_predict diff --git a/ppocr/modeling/heads/det_pse_head.py b/ppocr/modeling/heads/det_pse_head.py new file mode 100644 index 0000000000000000000000000000000000000000..32a5b48e190b7566411b19841b6aa14455b5d41d --- /dev/null +++ b/ppocr/modeling/heads/det_pse_head.py @@ -0,0 +1,37 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +from paddle import nn + + +class PSEHead(nn.Layer): + def __init__(self, in_channels, hidden_dim=256, out_channels=7, **kwargs): + super(PSEHead, self).__init__() + self.conv1 = nn.Conv2D( + in_channels, hidden_dim, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(hidden_dim) + self.relu1 = nn.ReLU() + + self.conv2 = nn.Conv2D( + hidden_dim, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x, **kwargs): + out = self.conv1(x) + out = self.relu1(self.bn1(out)) + out = self.conv2(out) + return {'maps': out} diff --git a/ppocr/modeling/heads/det_sast_head.py b/ppocr/modeling/heads/det_sast_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7a88a2db6c29c8c4fa1ee94d27bd0701cdbc90f8 --- /dev/null +++ b/ppocr/modeling/heads/det_sast_head.py @@ -0,0 +1,128 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class SAST_Header1(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(SAST_Header1, self).__init__() + out_channels = [64, 64, 128] + self.score_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_score1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_score2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_score3'), + ConvBNLayer(out_channels[2], 1, 3, 1, act=None, name='f_score4') + ) + self.border_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_border1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_border2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_border3'), + ConvBNLayer(out_channels[2], 4, 3, 1, act=None, name='f_border4') + ) + + def forward(self, x): + f_score = self.score_conv(x) + f_score = 
F.sigmoid(f_score) + f_border = self.border_conv(x) + return f_score, f_border + + +class SAST_Header2(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(SAST_Header2, self).__init__() + out_channels = [64, 64, 128] + self.tvo_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tvo1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tvo2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tvo3'), + ConvBNLayer(out_channels[2], 8, 3, 1, act=None, name='f_tvo4') + ) + self.tco_conv = nn.Sequential( + ConvBNLayer(in_channels, out_channels[0], 1, 1, act='relu', name='f_tco1'), + ConvBNLayer(out_channels[0], out_channels[1], 3, 1, act='relu', name='f_tco2'), + ConvBNLayer(out_channels[1], out_channels[2], 1, 1, act='relu', name='f_tco3'), + ConvBNLayer(out_channels[2], 2, 3, 1, act=None, name='f_tco4') + ) + + def forward(self, x): + f_tvo = self.tvo_conv(x) + f_tco = self.tco_conv(x) + return f_tvo, f_tco + + +class SASTHead(nn.Layer): + """ + """ + def __init__(self, in_channels, **kwargs): + super(SASTHead, self).__init__() + + self.head1 = SAST_Header1(in_channels) + self.head2 = SAST_Header2(in_channels) + + def forward(self, x, targets=None): + f_score, f_border = self.head1(x) + f_tvo, f_tco = self.head2(x) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_tvo'] = f_tvo + predicts['f_tco'] = f_tco + return predicts \ No newline at end of file diff --git a/ppocr/modeling/heads/e2e_pg_head.py b/ppocr/modeling/heads/e2e_pg_head.py new file mode 100644 index 0000000000000000000000000000000000000000..514962ef97e503d331b6351c6d314070dfd8b15f --- /dev/null +++ b/ppocr/modeling/heads/e2e_pg_head.py @@ -0,0 +1,262 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
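The SAST head above stacks two sub-headers and returns four maps with fixed channel counts: f_score (1), f_border (4), f_tvo (8) and f_tco (2). A quick shape check; the 128 input channels are an assumption that must match the SAST FPN output:

import paddle
from ppocr.modeling.heads.det_sast_head import SASTHead

head = SASTHead(in_channels=128)
preds = head(paddle.rand([1, 128, 128, 128]))
for name, tensor in preds.items():
    print(name, tensor.shape)   # 1, 4, 8 and 2 output channels respectively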
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance", + use_global_stats=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class PGHead(nn.Layer): + """ + """ + + def __init__(self, + in_channels, + character_dict_path='ppocr/utils/ic15_dict.txt', + **kwargs): + super(PGHead, self).__init__() + + # get character_length + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + character_length = len(lines) + 1 + + self.conv_f_score1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(1)) + self.conv_f_score2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_score{}".format(2)) + self.conv_f_score3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_score{}".format(3)) + + self.conv1 = nn.Conv2D( + in_channels=128, + out_channels=1, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_score{}".format(4)), + bias_attr=False) + + self.conv_f_boder1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(1)) + self.conv_f_boder2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_boder{}".format(2)) + self.conv_f_boder3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_boder{}".format(3)) + self.conv2 = nn.Conv2D( + in_channels=128, + out_channels=4, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_boder{}".format(4)), + bias_attr=False) + self.conv_f_char1 = ConvBNLayer( + in_channels=in_channels, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(1)) + self.conv_f_char2 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(2)) + self.conv_f_char3 = ConvBNLayer( + in_channels=128, + out_channels=256, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(3)) + self.conv_f_char4 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_char{}".format(4)) + self.conv_f_char5 = ConvBNLayer( + in_channels=256, + out_channels=256, + kernel_size=1, + 
stride=1, + padding=0, + act='relu', + name="conv_f_char{}".format(5)) + self.conv3 = nn.Conv2D( + in_channels=256, + out_channels=character_length, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_char{}".format(6)), + bias_attr=False) + + self.conv_f_direc1 = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(1)) + self.conv_f_direc2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + act='relu', + name="conv_f_direc{}".format(2)) + self.conv_f_direc3 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=1, + stride=1, + padding=0, + act='relu', + name="conv_f_direc{}".format(3)) + self.conv4 = nn.Conv2D( + in_channels=128, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + groups=1, + weight_attr=ParamAttr(name="conv_f_direc{}".format(4)), + bias_attr=False) + + def forward(self, x, targets=None): + f_score = self.conv_f_score1(x) + f_score = self.conv_f_score2(f_score) + f_score = self.conv_f_score3(f_score) + f_score = self.conv1(f_score) + f_score = F.sigmoid(f_score) + + # f_border + f_border = self.conv_f_boder1(x) + f_border = self.conv_f_boder2(f_border) + f_border = self.conv_f_boder3(f_border) + f_border = self.conv2(f_border) + + f_char = self.conv_f_char1(x) + f_char = self.conv_f_char2(f_char) + f_char = self.conv_f_char3(f_char) + f_char = self.conv_f_char4(f_char) + f_char = self.conv_f_char5(f_char) + f_char = self.conv3(f_char) + + f_direction = self.conv_f_direc1(x) + f_direction = self.conv_f_direc2(f_direction) + f_direction = self.conv_f_direc3(f_direction) + f_direction = self.conv4(f_direction) + + predicts = {} + predicts['f_score'] = f_score + predicts['f_border'] = f_border + predicts['f_char'] = f_char + predicts['f_direction'] = f_direction + return predicts diff --git a/ppocr/modeling/heads/gcn.py b/ppocr/modeling/heads/gcn.py new file mode 100644 index 0000000000000000000000000000000000000000..d123f067cb7640575e7b6cfdeb0ab1826ab62aab --- /dev/null +++ b/ppocr/modeling/heads/gcn.py @@ -0,0 +1,113 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
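Before the GCN module below, a rough smoke test for the PGHead added just above. The import path and the per-branch channel counts come from this diff; the input shape is illustrative, and the default ppocr/utils/ic15_dict.txt must be present relative to the working directory:

import paddle
from ppocr.modeling.heads.e2e_pg_head import PGHead

head = PGHead(in_channels=128)       # reads ppocr/utils/ic15_dict.txt by default
head.eval()
x = paddle.rand([1, 128, 128, 128])  # dummy fused feature map, size is illustrative
preds = head(x)
# Channel counts per branch (from the conv definitions above):
#   f_score: 1, f_border: 4, f_char: len(dict) + 1, f_direction: 2
print({k: v.shape for k, v in preds.items()})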
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/gcn.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, + num_features, + eps=1e-05, + momentum=0.1, + affine=True, + track_running_stats=True): + momentum = 1 - momentum + weight_attr = None + bias_attr = None + if not affine: + weight_attr = paddle.ParamAttr(learning_rate=0.0) + bias_attr = paddle.ParamAttr(learning_rate=0.0) + super().__init__( + num_features, + momentum=momentum, + epsilon=eps, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=track_running_stats) + + +class MeanAggregator(nn.Layer): + def forward(self, features, A): + x = paddle.bmm(A, features) + return x + + +class GraphConv(nn.Layer): + def __init__(self, in_dim, out_dim): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.weight = self.create_parameter( + [in_dim * 2, out_dim], + default_initializer=nn.initializer.XavierUniform()) + self.bias = self.create_parameter( + [out_dim], + is_bias=True, + default_initializer=nn.initializer.Assign([0] * out_dim)) + + self.aggregator = MeanAggregator() + + def forward(self, features, A): + b, n, d = features.shape + assert d == self.in_dim + agg_feats = self.aggregator(features, A) + cat_feats = paddle.concat([features, agg_feats], axis=2) + out = paddle.einsum('bnd,df->bnf', cat_feats, self.weight) + out = F.relu(out + self.bias) + return out + + +class GCN(nn.Layer): + def __init__(self, feat_len): + super(GCN, self).__init__() + self.bn0 = BatchNorm1D(feat_len, affine=False) + self.conv1 = GraphConv(feat_len, 512) + self.conv2 = GraphConv(512, 256) + self.conv3 = GraphConv(256, 128) + self.conv4 = GraphConv(128, 64) + self.classifier = nn.Sequential( + nn.Linear(64, 32), nn.PReLU(32), nn.Linear(32, 2)) + + def forward(self, x, A, knn_inds): + + num_local_graphs, num_max_nodes, feat_len = x.shape + + x = x.reshape([-1, feat_len]) + x = self.bn0(x) + x = x.reshape([num_local_graphs, num_max_nodes, feat_len]) + + x = self.conv1(x, A) + x = self.conv2(x, A) + x = self.conv3(x, A) + x = self.conv4(x, A) + k = knn_inds.shape[-1] + mid_feat_len = x.shape[-1] + edge_feat = paddle.zeros([num_local_graphs, k, mid_feat_len]) + for graph_ind in range(num_local_graphs): + edge_feat[graph_ind, :, :] = x[graph_ind][paddle.to_tensor(knn_inds[ + graph_ind])] + edge_feat = edge_feat.reshape([-1, mid_feat_len]) + pred = self.classifier(edge_feat) + + return pred diff --git a/ppocr/modeling/heads/kie_sdmgr_head.py b/ppocr/modeling/heads/kie_sdmgr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ac5f73fa7e5b182faa1456e069da79118d6f7068 --- /dev/null +++ b/ppocr/modeling/heads/kie_sdmgr_head.py @@ -0,0 +1,207 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# reference from : https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/kie/heads/sdmgr_head.py + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class SDMGRHead(nn.Layer): + def __init__(self, + in_channels, + num_chars=92, + visual_dim=16, + fusion_dim=1024, + node_input=32, + node_embed=256, + edge_input=5, + edge_embed=256, + num_gnn=2, + num_classes=26, + bidirectional=False): + super().__init__() + + self.fusion = Block([visual_dim, node_embed], node_embed, fusion_dim) + self.node_embed = nn.Embedding(num_chars, node_input, 0) + hidden = node_embed // 2 if bidirectional else node_embed + self.rnn = nn.LSTM( + input_size=node_input, hidden_size=hidden, num_layers=1) + self.edge_embed = nn.Linear(edge_input, edge_embed) + self.gnn_layers = nn.LayerList( + [GNNLayer(node_embed, edge_embed) for _ in range(num_gnn)]) + self.node_cls = nn.Linear(node_embed, num_classes) + self.edge_cls = nn.Linear(edge_embed, 2) + + def forward(self, input, targets): + relations, texts, x = input + node_nums, char_nums = [], [] + for text in texts: + node_nums.append(text.shape[0]) + char_nums.append(paddle.sum((text > -1).astype(int), axis=-1)) + + max_num = max([char_num.max() for char_num in char_nums]) + all_nodes = paddle.concat([ + paddle.concat( + [text, paddle.zeros( + (text.shape[0], max_num - text.shape[1]))], -1) + for text in texts + ]) + temp = paddle.clip(all_nodes, min=0).astype(int) + embed_nodes = self.node_embed(temp) + rnn_nodes, _ = self.rnn(embed_nodes) + + b, h, w = rnn_nodes.shape + nodes = paddle.zeros([b, w]) + all_nums = paddle.concat(char_nums) + valid = paddle.nonzero((all_nums > 0).astype(int)) + temp_all_nums = ( + paddle.gather(all_nums, valid) - 1).unsqueeze(-1).unsqueeze(-1) + temp_all_nums = paddle.expand(temp_all_nums, [ + temp_all_nums.shape[0], temp_all_nums.shape[1], rnn_nodes.shape[-1] + ]) + temp_all_nodes = paddle.gather(rnn_nodes, valid) + N, C, A = temp_all_nodes.shape + one_hot = F.one_hot( + temp_all_nums[:, 0, :], num_classes=C).transpose([0, 2, 1]) + one_hot = paddle.multiply( + temp_all_nodes, one_hot.astype("float32")).sum(axis=1, keepdim=True) + t = one_hot.expand([N, 1, A]).squeeze(1) + nodes = paddle.scatter(nodes, valid.squeeze(1), t) + + if x is not None: + nodes = self.fusion([x, nodes]) + + all_edges = paddle.concat( + [rel.reshape([-1, rel.shape[-1]]) for rel in relations]) + embed_edges = self.edge_embed(all_edges.astype('float32')) + embed_edges = F.normalize(embed_edges) + + for gnn_layer in self.gnn_layers: + nodes, cat_nodes = gnn_layer(nodes, embed_edges, node_nums) + + node_cls, edge_cls = self.node_cls(nodes), self.edge_cls(cat_nodes) + return node_cls, edge_cls + + +class GNNLayer(nn.Layer): + def __init__(self, node_dim=256, edge_dim=256): + super().__init__() + self.in_fc = nn.Linear(node_dim * 2 + edge_dim, node_dim) + self.coef_fc = nn.Linear(node_dim, 1) + self.out_fc = nn.Linear(node_dim, node_dim) + self.relu = nn.ReLU() + + def forward(self, nodes, edges, nums): + start, cat_nodes = 0, [] + for num in nums: + sample_nodes = nodes[start:start + num] + cat_nodes.append( + paddle.concat([ + paddle.expand(sample_nodes.unsqueeze(1), [-1, num, -1]), + paddle.expand(sample_nodes.unsqueeze(0), [num, -1, -1]) + ], -1).reshape([num**2, -1])) + start 
+= num + cat_nodes = paddle.concat([paddle.concat(cat_nodes), edges], -1) + cat_nodes = self.relu(self.in_fc(cat_nodes)) + coefs = self.coef_fc(cat_nodes) + + start, residuals = 0, [] + for num in nums: + residual = F.softmax( + -paddle.eye(num).unsqueeze(-1) * 1e9 + + coefs[start:start + num**2].reshape([num, num, -1]), 1) + residuals.append((residual * cat_nodes[start:start + num**2] + .reshape([num, num, -1])).sum(1)) + start += num**2 + + nodes += self.relu(self.out_fc(paddle.concat(residuals))) + return [nodes, cat_nodes] + + +class Block(nn.Layer): + def __init__(self, + input_dims, + output_dim, + mm_dim=1600, + chunks=20, + rank=15, + shared=False, + dropout_input=0., + dropout_pre_lin=0., + dropout_output=0., + pos_norm='before_cat'): + super().__init__() + self.rank = rank + self.dropout_input = dropout_input + self.dropout_pre_lin = dropout_pre_lin + self.dropout_output = dropout_output + assert (pos_norm in ['before_cat', 'after_cat']) + self.pos_norm = pos_norm + # Modules + self.linear0 = nn.Linear(input_dims[0], mm_dim) + self.linear1 = (self.linear0 + if shared else nn.Linear(input_dims[1], mm_dim)) + self.merge_linears0 = nn.LayerList() + self.merge_linears1 = nn.LayerList() + self.chunks = self.chunk_sizes(mm_dim, chunks) + for size in self.chunks: + ml0 = nn.Linear(size, size * rank) + self.merge_linears0.append(ml0) + ml1 = ml0 if shared else nn.Linear(size, size * rank) + self.merge_linears1.append(ml1) + self.linear_out = nn.Linear(mm_dim, output_dim) + + def forward(self, x): + x0 = self.linear0(x[0]) + x1 = self.linear1(x[1]) + bs = x1.shape[0] + if self.dropout_input > 0: + x0 = F.dropout(x0, p=self.dropout_input, training=self.training) + x1 = F.dropout(x1, p=self.dropout_input, training=self.training) + x0_chunks = paddle.split(x0, self.chunks, -1) + x1_chunks = paddle.split(x1, self.chunks, -1) + zs = [] + for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0, + self.merge_linears1): + m = m0(x0_c) * m1(x1_c) # bs x split_size*rank + m = m.reshape([bs, self.rank, -1]) + z = paddle.sum(m, 1) + if self.pos_norm == 'before_cat': + z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z)) + z = F.normalize(z) + zs.append(z) + z = paddle.concat(zs, 1) + if self.pos_norm == 'after_cat': + z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z)) + z = F.normalize(z) + + if self.dropout_pre_lin > 0: + z = F.dropout(z, p=self.dropout_pre_lin, training=self.training) + z = self.linear_out(z) + if self.dropout_output > 0: + z = F.dropout(z, p=self.dropout_output, training=self.training) + return z + + def chunk_sizes(self, dim, chunks): + split_size = (dim + chunks - 1) // chunks + sizes_list = [split_size] * chunks + sizes_list[-1] = sizes_list[-1] - (sum(sizes_list) - dim) + return sizes_list diff --git a/ppocr/modeling/heads/local_graph.py b/ppocr/modeling/heads/local_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..50fe6d72236df7afc2de3fda9e2e5db404641f34 --- /dev/null +++ b/ppocr/modeling/heads/local_graph.py @@ -0,0 +1,388 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/local_graph.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import paddle.nn as nn +from ppocr.ext_op import RoIAlignRotated + + +def normalize_adjacent_matrix(A): + assert A.ndim == 2 + assert A.shape[0] == A.shape[1] + + A = A + np.eye(A.shape[0]) + d = np.sum(A, axis=0) + d = np.clip(d, 0, None) + d_inv = np.power(d, -0.5).flatten() + d_inv[np.isinf(d_inv)] = 0.0 + d_inv = np.diag(d_inv) + G = A.dot(d_inv).transpose().dot(d_inv) + return G + + +def euclidean_distance_matrix(A, B): + """Calculate the Euclidean distance matrix. + + Args: + A (ndarray): The point sequence. + B (ndarray): The point sequence with the same dimensions as A. + + returns: + D (ndarray): The Euclidean distance matrix. + """ + assert A.ndim == 2 + assert B.ndim == 2 + assert A.shape[1] == B.shape[1] + + m = A.shape[0] + n = B.shape[0] + + A_dots = (A * A).sum(axis=1).reshape((m, 1)) * np.ones(shape=(1, n)) + B_dots = (B * B).sum(axis=1) * np.ones(shape=(m, 1)) + D_squared = A_dots + B_dots - 2 * A.dot(B.T) + + zero_mask = np.less(D_squared, 0.0) + D_squared[zero_mask] = 0.0 + D = np.sqrt(D_squared) + return D + + +def feature_embedding(input_feats, out_feat_len): + """Embed features. This code was partially adapted from + https://github.com/GXYM/DRRG licensed under the MIT license. + + Args: + input_feats (ndarray): The input features of shape (N, d), where N is + the number of nodes in graph, d is the input feature vector length. + out_feat_len (int): The length of output feature vector. + + Returns: + embedded_feats (ndarray): The embedded features. 
+ """ + assert input_feats.ndim == 2 + assert isinstance(out_feat_len, int) + assert out_feat_len >= input_feats.shape[1] + + num_nodes = input_feats.shape[0] + feat_dim = input_feats.shape[1] + feat_repeat_times = out_feat_len // feat_dim + residue_dim = out_feat_len % feat_dim + + if residue_dim > 0: + embed_wave = np.array([ + np.power(1000, 2.0 * (j // 2) / feat_repeat_times + 1) + for j in range(feat_repeat_times + 1) + ]).reshape((feat_repeat_times + 1, 1, 1)) + repeat_feats = np.repeat( + np.expand_dims( + input_feats, axis=0), feat_repeat_times, axis=0) + residue_feats = np.hstack([ + input_feats[:, 0:residue_dim], np.zeros( + (num_nodes, feat_dim - residue_dim)) + ]) + residue_feats = np.expand_dims(residue_feats, axis=0) + repeat_feats = np.concatenate([repeat_feats, residue_feats], axis=0) + embedded_feats = repeat_feats / embed_wave + embedded_feats[:, 0::2] = np.sin(embedded_feats[:, 0::2]) + embedded_feats[:, 1::2] = np.cos(embedded_feats[:, 1::2]) + embedded_feats = np.transpose(embedded_feats, (1, 0, 2)).reshape( + (num_nodes, -1))[:, 0:out_feat_len] + else: + embed_wave = np.array([ + np.power(1000, 2.0 * (j // 2) / feat_repeat_times) + for j in range(feat_repeat_times) + ]).reshape((feat_repeat_times, 1, 1)) + repeat_feats = np.repeat( + np.expand_dims( + input_feats, axis=0), feat_repeat_times, axis=0) + embedded_feats = repeat_feats / embed_wave + embedded_feats[:, 0::2] = np.sin(embedded_feats[:, 0::2]) + embedded_feats[:, 1::2] = np.cos(embedded_feats[:, 1::2]) + embedded_feats = np.transpose(embedded_feats, (1, 0, 2)).reshape( + (num_nodes, -1)).astype(np.float32) + + return embedded_feats + + +class LocalGraphs: + def __init__(self, k_at_hops, num_adjacent_linkages, node_geo_feat_len, + pooling_scale, pooling_output_size, local_graph_thr): + + assert len(k_at_hops) == 2 + assert all(isinstance(n, int) for n in k_at_hops) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert all(isinstance(n, int) for n in pooling_output_size) + assert isinstance(local_graph_thr, float) + + self.k_at_hops = k_at_hops + self.num_adjacent_linkages = num_adjacent_linkages + self.node_geo_feat_dim = node_geo_feat_len + self.pooling = RoIAlignRotated(pooling_output_size, pooling_scale) + self.local_graph_thr = local_graph_thr + + def generate_local_graphs(self, sorted_dist_inds, gt_comp_labels): + """Generate local graphs for GCN to predict which instance a text + component belongs to. + + Args: + sorted_dist_inds (ndarray): The complete graph node indices, which + is sorted according to the Euclidean distance. + gt_comp_labels(ndarray): The ground truth labels define the + instance to which the text components (nodes in graphs) belong. + + Returns: + pivot_local_graphs(list[list[int]]): The list of local graph + neighbor indices of pivots. + pivot_knns(list[list[int]]): The list of k-nearest neighbor indices + of pivots. 
+ """ + + assert sorted_dist_inds.ndim == 2 + assert (sorted_dist_inds.shape[0] == sorted_dist_inds.shape[1] == + gt_comp_labels.shape[0]) + + knn_graph = sorted_dist_inds[:, 1:self.k_at_hops[0] + 1] + pivot_local_graphs = [] + pivot_knns = [] + for pivot_ind, knn in enumerate(knn_graph): + + local_graph_neighbors = set(knn) + + for neighbor_ind in knn: + local_graph_neighbors.update( + set(sorted_dist_inds[neighbor_ind, 1:self.k_at_hops[1] + + 1])) + + local_graph_neighbors.discard(pivot_ind) + pivot_local_graph = list(local_graph_neighbors) + pivot_local_graph.insert(0, pivot_ind) + pivot_knn = [pivot_ind] + list(knn) + + if pivot_ind < 1: + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + else: + add_flag = True + for graph_ind, added_knn in enumerate(pivot_knns): + added_pivot_ind = added_knn[0] + added_local_graph = pivot_local_graphs[graph_ind] + + union = len( + set(pivot_local_graph[1:]).union( + set(added_local_graph[1:]))) + intersect = len( + set(pivot_local_graph[1:]).intersection( + set(added_local_graph[1:]))) + local_graph_iou = intersect / (union + 1e-8) + + if (local_graph_iou > self.local_graph_thr and + pivot_ind in added_knn and + gt_comp_labels[added_pivot_ind] == + gt_comp_labels[pivot_ind] and + gt_comp_labels[pivot_ind] != 0): + add_flag = False + break + if add_flag: + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + + return pivot_local_graphs, pivot_knns + + def generate_gcn_input(self, node_feat_batch, node_label_batch, + local_graph_batch, knn_batch, sorted_dist_ind_batch): + """Generate graph convolution network input data. + + Args: + node_feat_batch (List[Tensor]): The batched graph node features. + node_label_batch (List[ndarray]): The batched text component + labels. + local_graph_batch (List[List[list[int]]]): The local graph node + indices of image batch. + knn_batch (List[List[list[int]]]): The knn graph node indices of + image batch. + sorted_dist_ind_batch (list[ndarray]): The node indices sorted + according to the Euclidean distance. + + Returns: + local_graphs_node_feat (Tensor): The node features of graph. + adjacent_matrices (Tensor): The adjacent matrices of local graphs. + pivots_knn_inds (Tensor): The k-nearest neighbor indices in + local graph. + gt_linkage (Tensor): The surpervision signal of GCN for linkage + prediction. 
+ """ + assert isinstance(node_feat_batch, list) + assert isinstance(node_label_batch, list) + assert isinstance(local_graph_batch, list) + assert isinstance(knn_batch, list) + assert isinstance(sorted_dist_ind_batch, list) + + num_max_nodes = max([ + len(pivot_local_graph) + for pivot_local_graphs in local_graph_batch + for pivot_local_graph in pivot_local_graphs + ]) + + local_graphs_node_feat = [] + adjacent_matrices = [] + pivots_knn_inds = [] + pivots_gt_linkage = [] + + for batch_ind, sorted_dist_inds in enumerate(sorted_dist_ind_batch): + node_feats = node_feat_batch[batch_ind] + pivot_local_graphs = local_graph_batch[batch_ind] + pivot_knns = knn_batch[batch_ind] + node_labels = node_label_batch[batch_ind] + + for graph_ind, pivot_knn in enumerate(pivot_knns): + pivot_local_graph = pivot_local_graphs[graph_ind] + num_nodes = len(pivot_local_graph) + pivot_ind = pivot_local_graph[0] + node2ind_map = {j: i for i, j in enumerate(pivot_local_graph)} + + knn_inds = paddle.to_tensor( + [node2ind_map[i] for i in pivot_knn[1:]]) + pivot_feats = node_feats[pivot_ind] + normalized_feats = node_feats[paddle.to_tensor( + pivot_local_graph)] - pivot_feats + + adjacent_matrix = np.zeros( + (num_nodes, num_nodes), dtype=np.float32) + for node in pivot_local_graph: + neighbors = sorted_dist_inds[node, 1: + self.num_adjacent_linkages + 1] + for neighbor in neighbors: + if neighbor in pivot_local_graph: + + adjacent_matrix[node2ind_map[node], node2ind_map[ + neighbor]] = 1 + adjacent_matrix[node2ind_map[neighbor], + node2ind_map[node]] = 1 + + adjacent_matrix = normalize_adjacent_matrix(adjacent_matrix) + pad_adjacent_matrix = paddle.zeros( + (num_max_nodes, num_max_nodes)) + pad_adjacent_matrix[:num_nodes, :num_nodes] = paddle.cast( + paddle.to_tensor(adjacent_matrix), 'float32') + + pad_normalized_feats = paddle.concat( + [ + normalized_feats, paddle.zeros( + (num_max_nodes - num_nodes, + normalized_feats.shape[1])) + ], + axis=0) + local_graph_labels = node_labels[pivot_local_graph] + knn_labels = local_graph_labels[knn_inds.numpy()] + link_labels = ((node_labels[pivot_ind] == knn_labels) & + (node_labels[pivot_ind] > 0)).astype(np.int64) + link_labels = paddle.to_tensor(link_labels) + + local_graphs_node_feat.append(pad_normalized_feats) + adjacent_matrices.append(pad_adjacent_matrix) + pivots_knn_inds.append(knn_inds) + pivots_gt_linkage.append(link_labels) + + local_graphs_node_feat = paddle.stack(local_graphs_node_feat, 0) + adjacent_matrices = paddle.stack(adjacent_matrices, 0) + pivots_knn_inds = paddle.stack(pivots_knn_inds, 0) + pivots_gt_linkage = paddle.stack(pivots_gt_linkage, 0) + + return (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_gt_linkage) + + def __call__(self, feat_maps, comp_attribs): + """Generate local graphs as GCN input. + + Args: + feat_maps (Tensor): The feature maps to extract the content + features of text components. + comp_attribs (ndarray): The text component attributes. + + Returns: + local_graphs_node_feat (Tensor): The node features of graph. + adjacent_matrices (Tensor): The adjacent matrices of local graphs. + pivots_knn_inds (Tensor): The k-nearest neighbor indices in local + graph. + gt_linkage (Tensor): The surpervision signal of GCN for linkage + prediction. 
+ """ + + assert isinstance(feat_maps, paddle.Tensor) + assert comp_attribs.ndim == 3 + assert comp_attribs.shape[2] == 8 + + sorted_dist_inds_batch = [] + local_graph_batch = [] + knn_batch = [] + node_feat_batch = [] + node_label_batch = [] + + for batch_ind in range(comp_attribs.shape[0]): + num_comps = int(comp_attribs[batch_ind, 0, 0]) + comp_geo_attribs = comp_attribs[batch_ind, :num_comps, 1:7] + node_labels = comp_attribs[batch_ind, :num_comps, 7].astype( + np.int32) + + comp_centers = comp_geo_attribs[:, 0:2] + distance_matrix = euclidean_distance_matrix(comp_centers, + comp_centers) + + batch_id = np.zeros( + (comp_geo_attribs.shape[0], 1), dtype=np.float32) * batch_ind + comp_geo_attribs[:, -2] = np.clip(comp_geo_attribs[:, -2], -1, 1) + angle = np.arccos(comp_geo_attribs[:, -2]) * np.sign( + comp_geo_attribs[:, -1]) + angle = angle.reshape((-1, 1)) + rotated_rois = np.hstack( + [batch_id, comp_geo_attribs[:, :-2], angle]) + rois = paddle.to_tensor(rotated_rois) + content_feats = self.pooling(feat_maps[batch_ind].unsqueeze(0), + rois) + + content_feats = content_feats.reshape([content_feats.shape[0], -1]) + geo_feats = feature_embedding(comp_geo_attribs, + self.node_geo_feat_dim) + geo_feats = paddle.to_tensor(geo_feats) + node_feats = paddle.concat([content_feats, geo_feats], axis=-1) + + sorted_dist_inds = np.argsort(distance_matrix, axis=1) + pivot_local_graphs, pivot_knns = self.generate_local_graphs( + sorted_dist_inds, node_labels) + + node_feat_batch.append(node_feats) + node_label_batch.append(node_labels) + local_graph_batch.append(pivot_local_graphs) + knn_batch.append(pivot_knns) + sorted_dist_inds_batch.append(sorted_dist_inds) + + (node_feats, adjacent_matrices, knn_inds, gt_linkage) = \ + self.generate_gcn_input(node_feat_batch, + node_label_batch, + local_graph_batch, + knn_batch, + sorted_dist_inds_batch) + + return node_feats, adjacent_matrices, knn_inds, gt_linkage diff --git a/ppocr/modeling/heads/proposal_local_graph.py b/ppocr/modeling/heads/proposal_local_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7887c4ff42f8ae9d1826a71f01208cd81bb2d52c --- /dev/null +++ b/ppocr/modeling/heads/proposal_local_graph.py @@ -0,0 +1,412 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
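Before its inference-time counterpart below, note how the training-time LocalGraphs helper above is typically constructed. The parameter values here mirror common DRRG defaults and are illustrative only; RoIAlignRotated is a custom op from ppocr.ext_op that has to be compiled separately:

# Illustrative construction only; at train time feat_maps comes from the DRRG
# backbone/neck and comp_attribs from the DRRG label pipeline.
local_graphs = LocalGraphs(
    k_at_hops=(8, 4),              # neighbours kept at the 1st / 2nd hop
    num_adjacent_linkages=3,       # adjacency links kept per node
    node_geo_feat_len=120,         # length of the sinusoidal geometry embedding
    pooling_scale=1.0,             # RoIAlignRotated spatial scale
    pooling_output_size=(4, 3),    # RoIAlignRotated output size
    local_graph_thr=0.7)           # IoU threshold for dropping duplicate local graphs
# node_feats, adjacent_matrices, knn_inds, gt_linkage = local_graphs(feat_maps, comp_attribs)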
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/modules/proposal_local_graph.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from lanms import merge_quadrangle_n9 as la_nms + +from ppocr.ext_op import RoIAlignRotated +from .local_graph import (euclidean_distance_matrix, feature_embedding, + normalize_adjacent_matrix) + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return ~canvas | input_mask + + +class ProposalLocalGraphs: + def __init__(self, k_at_hops, num_adjacent_linkages, node_geo_feat_len, + pooling_scale, pooling_output_size, nms_thr, min_width, + max_width, comp_shrink_ratio, comp_w_h_ratio, comp_score_thr, + text_region_thr, center_region_thr, center_region_area_thr): + + assert len(k_at_hops) == 2 + assert isinstance(k_at_hops, tuple) + assert isinstance(num_adjacent_linkages, int) + assert isinstance(node_geo_feat_len, int) + assert isinstance(pooling_scale, float) + assert isinstance(pooling_output_size, tuple) + assert isinstance(nms_thr, float) + assert isinstance(min_width, float) + assert isinstance(max_width, float) + assert isinstance(comp_shrink_ratio, float) + assert isinstance(comp_w_h_ratio, float) + assert isinstance(comp_score_thr, float) + assert isinstance(text_region_thr, float) + assert isinstance(center_region_thr, float) + assert isinstance(center_region_area_thr, int) + + self.k_at_hops = k_at_hops + self.active_connection = num_adjacent_linkages + self.local_graph_depth = len(self.k_at_hops) + self.node_geo_feat_dim = node_geo_feat_len + self.pooling = RoIAlignRotated(pooling_output_size, pooling_scale) + self.nms_thr = nms_thr + self.min_width = min_width + self.max_width = max_width + self.comp_shrink_ratio = comp_shrink_ratio + self.comp_w_h_ratio = comp_w_h_ratio + self.comp_score_thr = comp_score_thr + self.text_region_thr = text_region_thr + self.center_region_thr = center_region_thr + self.center_region_area_thr = center_region_area_thr + + def propose_comps(self, score_map, top_height_map, bot_height_map, sin_map, + cos_map, comp_score_thr, min_width, max_width, + comp_shrink_ratio, comp_w_h_ratio): + """Propose text components. + + Args: + score_map (ndarray): The score map for NMS. + top_height_map (ndarray): The predicted text height map from each + pixel in text center region to top sideline. + bot_height_map (ndarray): The predicted text height map from each + pixel in text center region to bottom sideline. + sin_map (ndarray): The predicted sin(theta) map. + cos_map (ndarray): The predicted cos(theta) map. + comp_score_thr (float): The score threshold of text component. + min_width (float): The minimum width of text components. + max_width (float): The maximum width of text components. + comp_shrink_ratio (float): The shrink ratio of text components. + comp_w_h_ratio (float): The width to height ratio of text + components. + + Returns: + text_comps (ndarray): The text components. 
+ """ + + comp_centers = np.argwhere(score_map > comp_score_thr) + comp_centers = comp_centers[np.argsort(comp_centers[:, 0])] + y = comp_centers[:, 0] + x = comp_centers[:, 1] + + top_height = top_height_map[y, x].reshape((-1, 1)) * comp_shrink_ratio + bot_height = bot_height_map[y, x].reshape((-1, 1)) * comp_shrink_ratio + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + top_mid_pts = comp_centers + np.hstack( + [top_height * sin, top_height * cos]) + bot_mid_pts = comp_centers - np.hstack( + [bot_height * sin, bot_height * cos]) + + width = (top_height + bot_height) * comp_w_h_ratio + width = np.clip(width, min_width, max_width) + r = width / 2 + + tl = top_mid_pts[:, ::-1] - np.hstack([-r * sin, r * cos]) + tr = top_mid_pts[:, ::-1] + np.hstack([-r * sin, r * cos]) + br = bot_mid_pts[:, ::-1] + np.hstack([-r * sin, r * cos]) + bl = bot_mid_pts[:, ::-1] - np.hstack([-r * sin, r * cos]) + text_comps = np.hstack([tl, tr, br, bl]).astype(np.float32) + + score = score_map[y, x].reshape((-1, 1)) + text_comps = np.hstack([text_comps, score]) + + return text_comps + + def propose_comps_and_attribs(self, text_region_map, center_region_map, + top_height_map, bot_height_map, sin_map, + cos_map): + """Generate text components and attributes. + + Args: + text_region_map (ndarray): The predicted text region probability + map. + center_region_map (ndarray): The predicted text center region + probability map. + top_height_map (ndarray): The predicted text height map from each + pixel in text center region to top sideline. + bot_height_map (ndarray): The predicted text height map from each + pixel in text center region to bottom sideline. + sin_map (ndarray): The predicted sin(theta) map. + cos_map (ndarray): The predicted cos(theta) map. + + Returns: + comp_attribs (ndarray): The text component attributes. + text_comps (ndarray): The text components. 
+ """ + + assert (text_region_map.shape == center_region_map.shape == + top_height_map.shape == bot_height_map.shape == sin_map.shape == + cos_map.shape) + text_mask = text_region_map > self.text_region_thr + center_region_mask = ( + center_region_map > self.center_region_thr) * text_mask + + scale = np.sqrt(1.0 / (sin_map**2 + cos_map**2 + 1e-8)) + sin_map, cos_map = sin_map * scale, cos_map * scale + + center_region_mask = fill_hole(center_region_mask) + center_region_contours, _ = cv2.findContours( + center_region_mask.astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) + + mask_sz = center_region_map.shape + comp_list = [] + for contour in center_region_contours: + current_center_mask = np.zeros(mask_sz) + cv2.drawContours(current_center_mask, [contour], -1, 1, -1) + if current_center_mask.sum() <= self.center_region_area_thr: + continue + score_map = text_region_map * current_center_mask + + text_comps = self.propose_comps( + score_map, top_height_map, bot_height_map, sin_map, cos_map, + self.comp_score_thr, self.min_width, self.max_width, + self.comp_shrink_ratio, self.comp_w_h_ratio) + + text_comps = la_nms(text_comps, self.nms_thr) + text_comp_mask = np.zeros(mask_sz) + text_comp_boxes = text_comps[:, :8].reshape( + (-1, 4, 2)).astype(np.int32) + + cv2.drawContours(text_comp_mask, text_comp_boxes, -1, 1, -1) + if (text_comp_mask * text_mask).sum() < text_comp_mask.sum() * 0.5: + continue + if text_comps.shape[-1] > 0: + comp_list.append(text_comps) + + if len(comp_list) <= 0: + return None, None + + text_comps = np.vstack(comp_list) + text_comp_boxes = text_comps[:, :8].reshape((-1, 4, 2)) + centers = np.mean(text_comp_boxes, axis=1).astype(np.int32) + x = centers[:, 0] + y = centers[:, 1] + + scores = [] + for text_comp_box in text_comp_boxes: + text_comp_box[:, 0] = np.clip(text_comp_box[:, 0], 0, + mask_sz[1] - 1) + text_comp_box[:, 1] = np.clip(text_comp_box[:, 1], 0, + mask_sz[0] - 1) + min_coord = np.min(text_comp_box, axis=0).astype(np.int32) + max_coord = np.max(text_comp_box, axis=0).astype(np.int32) + text_comp_box = text_comp_box - min_coord + box_sz = (max_coord - min_coord + 1) + temp_comp_mask = np.zeros((box_sz[1], box_sz[0]), dtype=np.uint8) + cv2.fillPoly(temp_comp_mask, [text_comp_box.astype(np.int32)], 1) + temp_region_patch = text_region_map[min_coord[1]:(max_coord[1] + 1), + min_coord[0]:(max_coord[0] + 1)] + score = cv2.mean(temp_region_patch, temp_comp_mask)[0] + scores.append(score) + scores = np.array(scores).reshape((-1, 1)) + text_comps = np.hstack([text_comps[:, :-1], scores]) + + h = top_height_map[y, x].reshape( + (-1, 1)) + bot_height_map[y, x].reshape((-1, 1)) + w = np.clip(h * self.comp_w_h_ratio, self.min_width, self.max_width) + sin = sin_map[y, x].reshape((-1, 1)) + cos = cos_map[y, x].reshape((-1, 1)) + + x = x.reshape((-1, 1)) + y = y.reshape((-1, 1)) + comp_attribs = np.hstack([x, y, h, w, cos, sin]) + + return comp_attribs, text_comps + + def generate_local_graphs(self, sorted_dist_inds, node_feats): + """Generate local graphs and graph convolution network input data. + + Args: + sorted_dist_inds (ndarray): The node indices sorted according to + the Euclidean distance. + node_feats (tensor): The features of nodes in graph. + + Returns: + local_graphs_node_feats (tensor): The features of nodes in local + graphs. + adjacent_matrices (tensor): The adjacent matrices. + pivots_knn_inds (tensor): The k-nearest neighbor indices in + local graphs. + pivots_local_graphs (tensor): The indices of nodes in local + graphs. 
+ """ + + assert sorted_dist_inds.ndim == 2 + assert (sorted_dist_inds.shape[0] == sorted_dist_inds.shape[1] == + node_feats.shape[0]) + + knn_graph = sorted_dist_inds[:, 1:self.k_at_hops[0] + 1] + pivot_local_graphs = [] + pivot_knns = [] + + for pivot_ind, knn in enumerate(knn_graph): + + local_graph_neighbors = set(knn) + + for neighbor_ind in knn: + local_graph_neighbors.update( + set(sorted_dist_inds[neighbor_ind, 1:self.k_at_hops[1] + + 1])) + + local_graph_neighbors.discard(pivot_ind) + pivot_local_graph = list(local_graph_neighbors) + pivot_local_graph.insert(0, pivot_ind) + pivot_knn = [pivot_ind] + list(knn) + + pivot_local_graphs.append(pivot_local_graph) + pivot_knns.append(pivot_knn) + + num_max_nodes = max([ + len(pivot_local_graph) for pivot_local_graph in pivot_local_graphs + ]) + + local_graphs_node_feat = [] + adjacent_matrices = [] + pivots_knn_inds = [] + pivots_local_graphs = [] + + for graph_ind, pivot_knn in enumerate(pivot_knns): + pivot_local_graph = pivot_local_graphs[graph_ind] + num_nodes = len(pivot_local_graph) + pivot_ind = pivot_local_graph[0] + node2ind_map = {j: i for i, j in enumerate(pivot_local_graph)} + + knn_inds = paddle.cast( + paddle.to_tensor([node2ind_map[i] + for i in pivot_knn[1:]]), 'int64') + pivot_feats = node_feats[pivot_ind] + normalized_feats = node_feats[paddle.to_tensor( + pivot_local_graph)] - pivot_feats + + adjacent_matrix = np.zeros((num_nodes, num_nodes), dtype=np.float32) + for node in pivot_local_graph: + neighbors = sorted_dist_inds[node, 1:self.active_connection + 1] + for neighbor in neighbors: + if neighbor in pivot_local_graph: + adjacent_matrix[node2ind_map[node], node2ind_map[ + neighbor]] = 1 + adjacent_matrix[node2ind_map[neighbor], node2ind_map[ + node]] = 1 + + adjacent_matrix = normalize_adjacent_matrix(adjacent_matrix) + pad_adjacent_matrix = paddle.zeros((num_max_nodes, num_max_nodes), ) + pad_adjacent_matrix[:num_nodes, :num_nodes] = paddle.cast( + paddle.to_tensor(adjacent_matrix), 'float32') + + pad_normalized_feats = paddle.concat( + [ + normalized_feats, paddle.zeros( + (num_max_nodes - num_nodes, normalized_feats.shape[1]), + ) + ], + axis=0) + + local_graph_nodes = paddle.to_tensor(pivot_local_graph) + local_graph_nodes = paddle.concat( + [ + local_graph_nodes, paddle.zeros( + [num_max_nodes - num_nodes], dtype='int64') + ], + axis=-1) + + local_graphs_node_feat.append(pad_normalized_feats) + adjacent_matrices.append(pad_adjacent_matrix) + pivots_knn_inds.append(knn_inds) + pivots_local_graphs.append(local_graph_nodes) + + local_graphs_node_feat = paddle.stack(local_graphs_node_feat, 0) + adjacent_matrices = paddle.stack(adjacent_matrices, 0) + pivots_knn_inds = paddle.stack(pivots_knn_inds, 0) + pivots_local_graphs = paddle.stack(pivots_local_graphs, 0) + + return (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_local_graphs) + + def __call__(self, preds, feat_maps): + """Generate local graphs and graph convolutional network input data. + + Args: + preds (tensor): The predicted maps. + feat_maps (tensor): The feature maps to extract content feature of + text components. + + Returns: + none_flag (bool): The flag showing whether the number of proposed + text components is 0. + local_graphs_node_feats (tensor): The features of nodes in local + graphs. + adjacent_matrices (tensor): The adjacent matrices. + pivots_knn_inds (tensor): The k-nearest neighbor indices in + local graphs. + pivots_local_graphs (tensor): The indices of nodes in local + graphs. 
+ text_comps (ndarray): The predicted text components. + """ + if preds.ndim == 4: + assert preds.shape[0] == 1 + preds = paddle.squeeze(preds) + pred_text_region = F.sigmoid(preds[0]).numpy() + pred_center_region = F.sigmoid(preds[1]).numpy() + pred_sin_map = preds[2].numpy() + pred_cos_map = preds[3].numpy() + pred_top_height_map = preds[4].numpy() + pred_bot_height_map = preds[5].numpy() + + comp_attribs, text_comps = self.propose_comps_and_attribs( + pred_text_region, pred_center_region, pred_top_height_map, + pred_bot_height_map, pred_sin_map, pred_cos_map) + + if comp_attribs is None or len(comp_attribs) < 2: + none_flag = True + return none_flag, (0, 0, 0, 0, 0) + + comp_centers = comp_attribs[:, 0:2] + distance_matrix = euclidean_distance_matrix(comp_centers, comp_centers) + + geo_feats = feature_embedding(comp_attribs, self.node_geo_feat_dim) + geo_feats = paddle.to_tensor(geo_feats) + + batch_id = np.zeros((comp_attribs.shape[0], 1), dtype=np.float32) + comp_attribs = comp_attribs.astype(np.float32) + angle = np.arccos(comp_attribs[:, -2]) * np.sign(comp_attribs[:, -1]) + angle = angle.reshape((-1, 1)) + rotated_rois = np.hstack([batch_id, comp_attribs[:, :-2], angle]) + rois = paddle.to_tensor(rotated_rois) + + content_feats = self.pooling(feat_maps, rois) + content_feats = content_feats.reshape([content_feats.shape[0], -1]) + node_feats = paddle.concat([content_feats, geo_feats], axis=-1) + + sorted_dist_inds = np.argsort(distance_matrix, axis=1) + (local_graphs_node_feat, adjacent_matrices, pivots_knn_inds, + pivots_local_graphs) = self.generate_local_graphs(sorted_dist_inds, + node_feats) + + none_flag = False + return none_flag, (local_graphs_node_feat, adjacent_matrices, + pivots_knn_inds, pivots_local_graphs, text_comps) diff --git a/ppocr/modeling/heads/rec_abinet_head.py b/ppocr/modeling/heads/rec_abinet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2309ad65e6ebd32592df19eed0ce1fbd11cb9a81 --- /dev/null +++ b/ppocr/modeling/heads/rec_abinet_head.py @@ -0,0 +1,297 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
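Before the ABINet head below, a hedged sketch of how the ProposalLocalGraphs helper above is expected to be consumed at inference. The names proposal_local_graphs, gcn, preds and feat_maps are assumed to exist in the calling DRRG head and are not part of this diff; preds carries six channels (text region, center region, sin, cos, top height, bottom height), as read in __call__ above:

# Hedged inference-time flow; all names here are illustrative.
none_flag, graph_data = proposal_local_graphs(preds, feat_maps)
if none_flag:
    text_comps = None        # fewer than two components proposed, skip linkage
else:
    (node_feats, adjacent_matrices, knn_inds,
     local_graph_nodes, text_comps) = graph_data
    # GCN from gcn.py above scores each pivot / k-NN pair as link vs. no-link
    edge_logits = gcn(node_feats, adjacent_matrices, knn_inds)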
+""" +This code is refer from: +https://github.com/FangShancheng/ABINet/tree/main/modules +""" + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import LayerList +from ppocr.modeling.heads.rec_nrtr_head import TransformerBlock, PositionalEncoding + + +class BCNLanguage(nn.Layer): + def __init__(self, + d_model=512, + nhead=8, + num_layers=4, + dim_feedforward=2048, + dropout=0., + max_length=25, + detach=True, + num_classes=37): + super().__init__() + + self.d_model = d_model + self.detach = detach + self.max_length = max_length + 1 # additional stop token + self.proj = nn.Linear(num_classes, d_model, bias_attr=False) + self.token_encoder = PositionalEncoding( + dropout=0.1, dim=d_model, max_len=self.max_length) + self.pos_encoder = PositionalEncoding( + dropout=0, dim=d_model, max_len=self.max_length) + + self.decoder = nn.LayerList([ + TransformerBlock( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + attention_dropout_rate=dropout, + residual_dropout_rate=dropout, + with_self_attn=False, + with_cross_attn=True) for i in range(num_layers) + ]) + + self.cls = nn.Linear(d_model, num_classes) + + def forward(self, tokens, lengths): + """ + Args: + tokens: (B, N, C) where N is length, B is batch size and C is classes number + lengths: (B,) + """ + if self.detach: tokens = tokens.detach() + embed = self.proj(tokens) # (B, N, C) + embed = self.token_encoder(embed) # (B, N, C) + padding_mask = _get_mask(lengths, self.max_length) + zeros = paddle.zeros_like(embed) # (B, N, C) + qeury = self.pos_encoder(zeros) + for decoder_layer in self.decoder: + qeury = decoder_layer(qeury, embed, cross_mask=padding_mask) + output = qeury # (B, N, C) + + logits = self.cls(output) # (B, N, C) + + return output, logits + + +def encoder_layer(in_c, out_c, k=3, s=2, p=1): + return nn.Sequential( + nn.Conv2D(in_c, out_c, k, s, p), nn.BatchNorm2D(out_c), nn.ReLU()) + + +def decoder_layer(in_c, + out_c, + k=3, + s=1, + p=1, + mode='nearest', + scale_factor=None, + size=None): + align_corners = False if mode == 'nearest' else True + return nn.Sequential( + nn.Upsample( + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners), + nn.Conv2D(in_c, out_c, k, s, p), + nn.BatchNorm2D(out_c), + nn.ReLU()) + + +class PositionAttention(nn.Layer): + def __init__(self, + max_length, + in_channels=512, + num_channels=64, + h=8, + w=32, + mode='nearest', + **kwargs): + super().__init__() + self.max_length = max_length + self.k_encoder = nn.Sequential( + encoder_layer( + in_channels, num_channels, s=(1, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2)), + encoder_layer( + num_channels, num_channels, s=(2, 2))) + self.k_decoder = nn.Sequential( + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, num_channels, scale_factor=2, mode=mode), + decoder_layer( + num_channels, in_channels, size=(h, w), mode=mode)) + + self.pos_encoder = PositionalEncoding( + dropout=0, dim=in_channels, max_len=max_length) + self.project = nn.Linear(in_channels, in_channels) + + def forward(self, x): + B, C, H, W = x.shape + k, v = x, x + + # calculate key vector + features = [] + for i in range(0, len(self.k_encoder)): + k = self.k_encoder[i](k) + features.append(k) + for i in range(0, len(self.k_decoder) - 1): + k = self.k_decoder[i](k) + # print(k.shape, 
features[len(self.k_decoder) - 2 - i].shape) + k = k + features[len(self.k_decoder) - 2 - i] + k = self.k_decoder[-1](k) + + # calculate query vector + # TODO q=f(q,k) + zeros = paddle.zeros( + (B, self.max_length, C), dtype=x.dtype) # (T, N, C) + q = self.pos_encoder(zeros) # (B, N, C) + q = self.project(q) # (B, N, C) + + # calculate attention + attn_scores = q @k.flatten(2) # (B, N, (H*W)) + attn_scores = attn_scores / (C**0.5) + attn_scores = F.softmax(attn_scores, axis=-1) + + v = v.flatten(2).transpose([0, 2, 1]) # (B, (H*W), C) + attn_vecs = attn_scores @v # (B, N, C) + + return attn_vecs, attn_scores.reshape([0, self.max_length, H, W]) + + +class ABINetHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + d_model=512, + nhead=8, + num_layers=3, + dim_feedforward=2048, + dropout=0.1, + max_length=25, + use_lang=False, + iter_size=1): + super().__init__() + self.max_length = max_length + 1 + self.pos_encoder = PositionalEncoding( + dropout=0.1, dim=d_model, max_len=8 * 32) + self.encoder = nn.LayerList([ + TransformerBlock( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + attention_dropout_rate=dropout, + residual_dropout_rate=dropout, + with_self_attn=True, + with_cross_attn=False) for i in range(num_layers) + ]) + self.decoder = PositionAttention( + max_length=max_length + 1, # additional stop token + mode='nearest', ) + self.out_channels = out_channels + self.cls = nn.Linear(d_model, self.out_channels) + self.use_lang = use_lang + if use_lang: + self.iter_size = iter_size + self.language = BCNLanguage( + d_model=d_model, + nhead=nhead, + num_layers=4, + dim_feedforward=dim_feedforward, + dropout=dropout, + max_length=max_length, + num_classes=self.out_channels) + # alignment + self.w_att_align = nn.Linear(2 * d_model, d_model) + self.cls_align = nn.Linear(d_model, self.out_channels) + + def forward(self, x, targets=None): + x = x.transpose([0, 2, 3, 1]) + _, H, W, C = x.shape + feature = x.flatten(1, 2) + feature = self.pos_encoder(feature) + for encoder_layer in self.encoder: + feature = encoder_layer(feature) + feature = feature.reshape([0, H, W, C]).transpose([0, 3, 1, 2]) + v_feature, attn_scores = self.decoder( + feature) # (B, N, C), (B, C, H, W) + vis_logits = self.cls(v_feature) # (B, N, C) + logits = vis_logits + vis_lengths = _get_length(vis_logits) + if self.use_lang: + align_logits = vis_logits + align_lengths = vis_lengths + all_l_res, all_a_res = [], [] + for i in range(self.iter_size): + tokens = F.softmax(align_logits, axis=-1) + lengths = align_lengths + lengths = paddle.clip( + lengths, 2, self.max_length) # TODO:move to langauge model + l_feature, l_logits = self.language(tokens, lengths) + + # alignment + all_l_res.append(l_logits) + fuse = paddle.concat((l_feature, v_feature), -1) + f_att = F.sigmoid(self.w_att_align(fuse)) + output = f_att * v_feature + (1 - f_att) * l_feature + align_logits = self.cls_align(output) # (B, N, C) + + align_lengths = _get_length(align_logits) + all_a_res.append(align_logits) + if self.training: + return { + 'align': all_a_res, + 'lang': all_l_res, + 'vision': vis_logits + } + else: + logits = align_logits + if self.training: + return logits + else: + return F.softmax(logits, -1) + + +def _get_length(logit): + """ Greed decoder to obtain length from logit""" + out = (logit.argmax(-1) == 0) + abn = out.any(-1) + out_int = out.cast('int32') + out = (out_int.cumsum(-1) == 1) & out + out = out.cast('int32') + out = out.argmax(-1) + out = out + 1 + len_seq = paddle.zeros_like(out) + logit.shape[1] 
+ out = paddle.where(abn, out, len_seq) + return out + + +def _get_mask(length, max_length): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + length = length.unsqueeze(-1) + B = paddle.shape(length)[0] + grid = paddle.arange(0, max_length).unsqueeze(0).tile([B, 1]) + zero_mask = paddle.zeros([B, max_length], dtype='float32') + inf_mask = paddle.full([B, max_length], '-inf', dtype='float32') + diag_mask = paddle.diag( + paddle.full( + [max_length], '-inf', dtype=paddle.float32), + offset=0, + name=None) + mask = paddle.where(grid >= length, inf_mask, zero_mask) + mask = mask.unsqueeze(1) + diag_mask + return mask.unsqueeze(1) diff --git a/ppocr/modeling/heads/rec_aster_head.py b/ppocr/modeling/heads/rec_aster_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c95e8fd31f84c26cf58f7fbbdaab6c825b10eea8 --- /dev/null +++ b/ppocr/modeling/heads/rec_aster_head.py @@ -0,0 +1,393 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/attention_recognition_head.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class AsterHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + sDim, + attDim, + max_len_labels, + time_step=25, + beam_width=5, + **kwargs): + super(AsterHead, self).__init__() + self.num_classes = out_channels + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + self.decoder = AttentionRecognitionHead(in_channels, out_channels, sDim, + attDim, max_len_labels) + self.time_step = time_step + self.embeder = Embedding(self.time_step, in_channels) + self.beam_width = beam_width + self.eos = self.num_classes - 3 + + def forward(self, x, targets=None, embed=None): + return_dict = {} + embedding_vectors = self.embeder(x) + + if self.training: + rec_targets, rec_lengths, _ = targets + rec_pred = self.decoder([x, rec_targets, rec_lengths], + embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['embedding_vectors'] = embedding_vectors + else: + rec_pred, rec_pred_scores = self.decoder.beam_search( + x, self.beam_width, self.eos, embedding_vectors) + return_dict['rec_pred'] = rec_pred + return_dict['rec_pred_scores'] = rec_pred_scores + return_dict['embedding_vectors'] = embedding_vectors + + return return_dict + + +class Embedding(nn.Layer): + def __init__(self, in_timestep, in_planes, mid_dim=4096, embed_dim=300): + super(Embedding, self).__init__() + self.in_timestep = in_timestep + self.in_planes = in_planes + self.embed_dim = embed_dim + self.mid_dim = mid_dim + self.eEmbed = nn.Linear( + in_timestep * in_planes, + self.embed_dim) # Embed encoder output to a 
word-embedding like + + def forward(self, x): + x = paddle.reshape(x, [paddle.shape(x)[0], -1]) + x = self.eEmbed(x) + return x + + +class AttentionRecognitionHead(nn.Layer): + """ + input: [b x 16 x 64 x in_planes] + output: probability sequence: [b x T x num_classes] + """ + + def __init__(self, in_channels, out_channels, sDim, attDim, max_len_labels): + super(AttentionRecognitionHead, self).__init__() + self.num_classes = out_channels # this is the output classes. So it includes the . + self.in_planes = in_channels + self.sDim = sDim + self.attDim = attDim + self.max_len_labels = max_len_labels + + self.decoder = DecoderUnit( + sDim=sDim, xDim=in_channels, yDim=self.num_classes, attDim=attDim) + + def forward(self, x, embed): + x, targets, lengths = x + batch_size = paddle.shape(x)[0] + # Decoder + state = self.decoder.get_initial_state(embed) + outputs = [] + for i in range(max(lengths)): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = targets[:, i - 1] + output, state = self.decoder(x, state, y_prev) + outputs.append(output) + outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1) + return outputs + + # inference stage. + def sample(self, x): + x, _, _ = x + batch_size = x.size(0) + # Decoder + state = paddle.zeros([1, batch_size, self.sDim]) + + predicted_ids, predicted_scores = [], [] + for i in range(self.max_len_labels): + if i == 0: + y_prev = paddle.full( + shape=[batch_size], fill_value=self.num_classes) + else: + y_prev = predicted + + output, state = self.decoder(x, state, y_prev) + output = F.softmax(output, axis=1) + score, predicted = output.max(1) + predicted_ids.append(predicted.unsqueeze(1)) + predicted_scores.append(score.unsqueeze(1)) + predicted_ids = paddle.concat([predicted_ids, 1]) + predicted_scores = paddle.concat([predicted_scores, 1]) + # return predicted_ids.squeeze(), predicted_scores.squeeze() + return predicted_ids, predicted_scores + + def beam_search(self, x, beam_width, eos, embed): + def _inflate(tensor, times, dim): + repeat_dims = [1] * tensor.dim() + repeat_dims[dim] = times + output = paddle.tile(tensor, repeat_dims) + return output + + # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py + batch_size, l, d = x.shape + x = paddle.tile( + paddle.transpose( + x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1]) + inflated_encoder_feats = paddle.reshape( + paddle.transpose( + x, perm=[1, 0, 2, 3]), [-1, l, d]) + + # Initialize the decoder + state = self.decoder.get_initial_state(embed, tile_times=beam_width) + + pos_index = paddle.reshape( + paddle.arange(batch_size) * beam_width, shape=[-1, 1]) + + # Initialize the scores + sequence_scores = paddle.full( + shape=[batch_size * beam_width, 1], fill_value=-float('Inf')) + index = [i * beam_width for i in range(0, batch_size)] + sequence_scores[index] = 0.0 + + # Initialize the input vector + y_prev = paddle.full( + shape=[batch_size * beam_width], fill_value=self.num_classes) + + # Store decisions for backtracking + stored_scores = list() + stored_predecessors = list() + stored_emitted_symbols = list() + + for i in range(self.max_len_labels): + output, state = self.decoder(inflated_encoder_feats, state, y_prev) + state = paddle.unsqueeze(state, axis=0) + log_softmax_output = paddle.nn.functional.log_softmax( + output, axis=1) + + sequence_scores = _inflate(sequence_scores, self.num_classes, 1) + sequence_scores += log_softmax_output + scores, candidates = paddle.topk( + 
paddle.reshape(sequence_scores, [batch_size, -1]), + beam_width, + axis=1) + + # Reshape input = (bk, 1) and sequence_scores = (bk, 1) + y_prev = paddle.reshape( + candidates % self.num_classes, shape=[batch_size * beam_width]) + sequence_scores = paddle.reshape( + scores, shape=[batch_size * beam_width, 1]) + + # Update fields for next timestep + pos_index = paddle.expand_as(pos_index, candidates) + predecessors = paddle.cast( + candidates / self.num_classes + pos_index, dtype='int64') + predecessors = paddle.reshape( + predecessors, shape=[batch_size * beam_width, 1]) + state = paddle.index_select( + state, index=predecessors.squeeze(), axis=1) + + # Update sequence socres and erase scores for symbol so that they aren't expanded + stored_scores.append(sequence_scores.clone()) + y_prev = paddle.reshape(y_prev, shape=[-1, 1]) + eos_prev = paddle.full_like(y_prev, fill_value=eos) + mask = eos_prev == y_prev + mask = paddle.nonzero(mask) + if mask.dim() > 0: + sequence_scores = sequence_scores.numpy() + mask = mask.numpy() + sequence_scores[mask] = -float('inf') + sequence_scores = paddle.to_tensor(sequence_scores) + + # Cache results for backtracking + stored_predecessors.append(predecessors) + y_prev = paddle.squeeze(y_prev) + stored_emitted_symbols.append(y_prev) + + # Do backtracking to return the optimal values + #====== backtrak ======# + # Initialize return variables given different types + p = list() + l = [[self.max_len_labels] * beam_width for _ in range(batch_size) + ] # Placeholder for lengths of top-k sequences + + # the last step output of the beams are not sorted + # thus they are sorted here + sorted_score, sorted_idx = paddle.topk( + paddle.reshape( + stored_scores[-1], shape=[batch_size, beam_width]), + beam_width) + + # initialize the sequence scores with the sorted last step beam scores + s = sorted_score.clone() + + batch_eos_found = [0] * batch_size # the number of EOS found + # in the backward loop below for each batch + t = self.max_len_labels - 1 + # initialize the back pointer with the sorted order of the last step beams. + # add pos_index for indexing variable with b*k as the first dimension. 
+ t_predecessors = paddle.reshape( + sorted_idx + pos_index.expand_as(sorted_idx), + shape=[batch_size * beam_width]) + while t >= 0: + # Re-order the variables with the back pointer + current_symbol = paddle.index_select( + stored_emitted_symbols[t], index=t_predecessors, axis=0) + t_predecessors = paddle.index_select( + stored_predecessors[t].squeeze(), index=t_predecessors, axis=0) + eos_indices = stored_emitted_symbols[t] == eos + eos_indices = paddle.nonzero(eos_indices) + + if eos_indices.dim() > 0: + for i in range(eos_indices.shape[0] - 1, -1, -1): + # Indices of the EOS symbol for both variables + # with b*k as the first dimension, and b, k for + # the first two dimensions + idx = eos_indices[i] + b_idx = int(idx[0] / beam_width) + # The indices of the replacing position + # according to the replacement strategy noted above + res_k_idx = beam_width - (batch_eos_found[b_idx] % + beam_width) - 1 + batch_eos_found[b_idx] += 1 + res_idx = b_idx * beam_width + res_k_idx + + # Replace the old information in return variables + # with the new ended sequence information + t_predecessors[res_idx] = stored_predecessors[t][idx[0]] + current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]] + s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0] + l[b_idx][res_k_idx] = t + 1 + + # record the back tracked results + p.append(current_symbol) + t -= 1 + + # Sort and re-order again as the added ended sequences may change + # the order (very unlikely) + s, re_sorted_idx = s.topk(beam_width) + for b_idx in range(batch_size): + l[b_idx] = [ + l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :] + ] + + re_sorted_idx = paddle.reshape( + re_sorted_idx + pos_index.expand_as(re_sorted_idx), + [batch_size * beam_width]) + + # Reverse the sequences and re-order at the same time + # It is reversed because the backtracking happens in reverse time order + p = [ + paddle.reshape( + paddle.index_select(step, re_sorted_idx, 0), + shape=[batch_size, beam_width, -1]) for step in reversed(p) + ] + p = paddle.concat(p, -1)[:, 0, :] + return p, paddle.ones_like(p) + + +class AttentionUnit(nn.Layer): + def __init__(self, sDim, xDim, attDim): + super(AttentionUnit, self).__init__() + + self.sDim = sDim + self.xDim = xDim + self.attDim = attDim + + self.sEmbed = nn.Linear(sDim, attDim) + self.xEmbed = nn.Linear(xDim, attDim) + self.wEmbed = nn.Linear(attDim, 1) + + def forward(self, x, sPrev): + batch_size, T, _ = x.shape # [b x T x xDim] + x = paddle.reshape(x, [-1, self.xDim]) # [(b x T) x xDim] + xProj = self.xEmbed(x) # [(b x T) x attDim] + xProj = paddle.reshape(xProj, [batch_size, T, -1]) # [b x T x attDim] + + sPrev = sPrev.squeeze(0) + sProj = self.sEmbed(sPrev) # [b x attDim] + sProj = paddle.unsqueeze(sProj, 1) # [b x 1 x attDim] + sProj = paddle.expand(sProj, + [batch_size, T, self.attDim]) # [b x T x attDim] + + sumTanh = paddle.tanh(sProj + xProj) + sumTanh = paddle.reshape(sumTanh, [-1, self.attDim]) + + vProj = self.wEmbed(sumTanh) # [(b x T) x 1] + vProj = paddle.reshape(vProj, [batch_size, T]) + alpha = F.softmax( + vProj, axis=1) # attention weights for each sample in the minibatch + return alpha + + +class DecoderUnit(nn.Layer): + def __init__(self, sDim, xDim, yDim, attDim): + super(DecoderUnit, self).__init__() + self.sDim = sDim + self.xDim = xDim + self.yDim = yDim + self.attDim = attDim + self.emdDim = attDim + + self.attention_unit = AttentionUnit(sDim, xDim, attDim) + self.tgt_embedding = nn.Embedding( + yDim + 1, self.emdDim, weight_attr=nn.initializer.Normal( + std=0.01)) # the last is 
used for + self.gru = nn.GRUCell(input_size=xDim + self.emdDim, hidden_size=sDim) + self.fc = nn.Linear( + sDim, + yDim, + weight_attr=nn.initializer.Normal(std=0.01), + bias_attr=nn.initializer.Constant(value=0)) + self.embed_fc = nn.Linear(300, self.sDim) + + def get_initial_state(self, embed, tile_times=1): + assert embed.shape[1] == 300 + state = self.embed_fc(embed) # N * sDim + if tile_times != 1: + state = state.unsqueeze(1) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.tile(trans_state, repeat_times=[tile_times, 1, 1]) + trans_state = paddle.transpose(state, perm=[1, 0, 2]) + state = paddle.reshape(trans_state, shape=[-1, self.sDim]) + state = state.unsqueeze(0) # 1 * N * sDim + return state + + def forward(self, x, sPrev, yPrev): + # x: feature sequence from the image decoder. + batch_size, T, _ = x.shape + alpha = self.attention_unit(x, sPrev) + context = paddle.squeeze(paddle.matmul(alpha.unsqueeze(1), x), axis=1) + yPrev = paddle.cast(yPrev, dtype="int64") + yProj = self.tgt_embedding(yPrev) + + concat_context = paddle.concat([yProj, context], 1) + concat_context = paddle.squeeze(concat_context, 1) + sPrev = paddle.squeeze(sPrev, 0) + output, state = self.gru(concat_context, sPrev) + output = paddle.squeeze(output, axis=1) + output = self.fc(output) + return output, state \ No newline at end of file diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6349ee0c2c0460f45f02bc1998aac4bdb6bdd632 --- /dev/null +++ b/ppocr/modeling/heads/rec_att_head.py @@ -0,0 +1,205 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
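AttentionUnit above scores every encoder time step against the previous decoder state with a small additive (Bahdanau-style) network: both inputs are projected to attDim, combined through a tanh, reduced to one logit per step by wEmbed, and normalised with a softmax over T; the attention cells in the file that follows repeat the same pattern. A minimal, self-contained sketch of that computation with invented toy sizes (not the real configuration):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

# Toy sizes only; the real values come from the head's configuration.
b, T, xDim, sDim, attDim = 2, 5, 32, 48, 16

x = paddle.randn([b, T, xDim])       # encoder features, one vector per time step
s_prev = paddle.randn([b, sDim])     # previous decoder hidden state

x_proj = nn.Linear(xDim, attDim)(x)                # [b, T, attDim]
s_proj = nn.Linear(sDim, attDim)(s_prev)           # [b, attDim]
scores = nn.Linear(attDim, 1)(paddle.tanh(x_proj + s_proj.unsqueeze(1)))  # [b, T, 1]
alpha = F.softmax(scores.squeeze(-1), axis=1)      # attention weights over the T steps

context = paddle.matmul(alpha.unsqueeze(1), x).squeeze(1)  # [b, xDim] glimpse for the GRU step
print(alpha.shape, context.shape)

The resulting context vector is what DecoderUnit concatenates with the embedding of the previous symbol before its GRU step.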
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class AttentionHead(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionGRUCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = paddle.shape(inputs)[0] + num_steps = batch_max_length + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + outputs = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(outputs) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + next_input = probs_step.argmax(axis=1) + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, axis=2) + return probs + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +class AttentionLSTM(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + 
+ def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + + next_input = probs_step.argmax(axis=1) + + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, axis=2) + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/ppocr/modeling/heads/rec_can_head.py b/ppocr/modeling/heads/rec_can_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1ce4c4b548085fe8cd3f69d18183b326f0706d88 --- /dev/null +++ b/ppocr/modeling/heads/rec_can_head.py @@ -0,0 +1,319 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
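AttentionHead and AttentionLSTM above share one decoding recipe: when targets are provided (training), the ground-truth character of the previous step is fed back as a one-hot vector (teacher forcing); otherwise (inference), the argmax of the previous step's logits is fed back and the softmax is applied only at the end. A small self-contained sketch of that control flow, with a dummy step function standing in for the real attention cell and generator:

import paddle
import paddle.nn.functional as F

num_classes, num_steps, batch = 6, 4, 2

def fake_step(prev_onehot):
    # Stand-in for attention_cell + generator: random logits, one row per sample.
    return paddle.randn([batch, num_classes])

targets = paddle.randint(0, num_classes, [batch, num_steps])

# Training: teacher forcing, the ground-truth symbol drives every step.
for i in range(num_steps):
    logits = fake_step(F.one_hot(targets[:, i], num_classes))

# Inference: greedy feedback, the previous prediction drives the next step.
prev = paddle.zeros([batch], dtype='int64')   # start symbol, index 0 as in the heads above
outputs = []
for i in range(num_steps):
    logits = fake_step(F.one_hot(prev, num_classes))
    prev = logits.argmax(axis=1)
    outputs.append(F.softmax(logits, axis=1))
probs = paddle.stack(outputs, axis=1)         # [batch, num_steps, num_classes]
print(probs.shape)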
+""" +This code is refer from: +https://github.com/LBH1024/CAN/models/can.py +https://github.com/LBH1024/CAN/models/counting.py +https://github.com/LBH1024/CAN/models/decoder.py +https://github.com/LBH1024/CAN/models/attention.py + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.nn as nn +import paddle +import math +''' +Counting Module +''' + + +class ChannelAtt(nn.Layer): + def __init__(self, channel, reduction): + super(ChannelAtt, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(), nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.shape + y = paddle.reshape(self.avg_pool(x), [b, c]) + y = paddle.reshape(self.fc(y), [b, c, 1, 1]) + return x * y + + +class CountingDecoder(nn.Layer): + def __init__(self, in_channel, out_channel, kernel_size): + super(CountingDecoder, self).__init__() + self.in_channel = in_channel + self.out_channel = out_channel + + self.trans_layer = nn.Sequential( + nn.Conv2D( + self.in_channel, + 512, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias_attr=False), + nn.BatchNorm2D(512)) + + self.channel_att = ChannelAtt(512, 16) + + self.pred_layer = nn.Sequential( + nn.Conv2D( + 512, self.out_channel, kernel_size=1, bias_attr=False), + nn.Sigmoid()) + + def forward(self, x, mask): + b, _, h, w = x.shape + x = self.trans_layer(x) + x = self.channel_att(x) + x = self.pred_layer(x) + + if mask is not None: + x = x * mask + x = paddle.reshape(x, [b, self.out_channel, -1]) + x1 = paddle.sum(x, axis=-1) + + return x1, paddle.reshape(x, [b, self.out_channel, h, w]) + + +''' +Attention Decoder +''' + + +class PositionEmbeddingSine(nn.Layer): + def __init__(self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask): + y_embed = paddle.cumsum(mask, 1, dtype='float32') + x_embed = paddle.cumsum(mask, 2, dtype='float32') + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + dim_t = paddle.arange(self.num_pos_feats, dtype='float32') + dim_d = paddle.expand(paddle.to_tensor(2), dim_t.shape) + dim_t = self.temperature**(2 * (dim_t / dim_d).astype('int64') / + self.num_pos_feats) + + pos_x = paddle.unsqueeze(x_embed, [3]) / dim_t + pos_y = paddle.unsqueeze(y_embed, [3]) / dim_t + + pos_x = paddle.flatten( + paddle.stack( + [ + paddle.sin(pos_x[:, :, :, 0::2]), + paddle.cos(pos_x[:, :, :, 1::2]) + ], + axis=4), + 3) + pos_y = paddle.flatten( + paddle.stack( + [ + paddle.sin(pos_y[:, :, :, 0::2]), + paddle.cos(pos_y[:, :, :, 1::2]) + ], + axis=4), + 3) + + pos = paddle.transpose( + paddle.concat( + [pos_y, pos_x], axis=3), [0, 3, 1, 2]) + + return pos + + +class AttDecoder(nn.Layer): + def __init__(self, ratio, is_train, input_size, hidden_size, + encoder_out_channel, dropout, dropout_ratio, word_num, + counting_decoder_out_channel, attention): + super(AttDecoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.out_channel = encoder_out_channel + 
self.attention_dim = attention['attention_dim'] + self.dropout_prob = dropout + self.ratio = ratio + self.word_num = word_num + + self.counting_num = counting_decoder_out_channel + self.is_train = is_train + + self.init_weight = nn.Linear(self.out_channel, self.hidden_size) + self.embedding = nn.Embedding(self.word_num, self.input_size) + self.word_input_gru = nn.GRUCell(self.input_size, self.hidden_size) + self.word_attention = Attention(hidden_size, attention['attention_dim']) + + self.encoder_feature_conv = nn.Conv2D( + self.out_channel, + self.attention_dim, + kernel_size=attention['word_conv_kernel'], + padding=attention['word_conv_kernel'] // 2) + + self.word_state_weight = nn.Linear(self.hidden_size, self.hidden_size) + self.word_embedding_weight = nn.Linear(self.input_size, + self.hidden_size) + self.word_context_weight = nn.Linear(self.out_channel, self.hidden_size) + self.counting_context_weight = nn.Linear(self.counting_num, + self.hidden_size) + self.word_convert = nn.Linear(self.hidden_size, self.word_num) + + if dropout: + self.dropout = nn.Dropout(dropout_ratio) + + def forward(self, cnn_features, labels, counting_preds, images_mask): + if self.is_train: + _, num_steps = labels.shape + else: + num_steps = 36 + + batch_size, _, height, width = cnn_features.shape + images_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + + word_probs = paddle.zeros((batch_size, num_steps, self.word_num)) + word_alpha_sum = paddle.zeros((batch_size, 1, height, width)) + + hidden = self.init_hidden(cnn_features, images_mask) + counting_context_weighted = self.counting_context_weight(counting_preds) + cnn_features_trans = self.encoder_feature_conv(cnn_features) + + position_embedding = PositionEmbeddingSine(256, normalize=True) + pos = position_embedding(cnn_features_trans, images_mask[:, 0, :, :]) + + cnn_features_trans = cnn_features_trans + pos + + word = paddle.ones([batch_size, 1], dtype='int64') # init word as sos + word = word.squeeze(axis=1) + for i in range(num_steps): + word_embedding = self.embedding(word) + _, hidden = self.word_input_gru(word_embedding, hidden) + word_context_vec, _, word_alpha_sum = self.word_attention( + cnn_features, cnn_features_trans, hidden, word_alpha_sum, + images_mask) + + current_state = self.word_state_weight(hidden) + word_weighted_embedding = self.word_embedding_weight(word_embedding) + word_context_weighted = self.word_context_weight(word_context_vec) + + if self.dropout_prob: + word_out_state = self.dropout( + current_state + word_weighted_embedding + + word_context_weighted + counting_context_weighted) + else: + word_out_state = current_state + word_weighted_embedding + word_context_weighted + counting_context_weighted + + word_prob = self.word_convert(word_out_state) + word_probs[:, i] = word_prob + + if self.is_train: + word = labels[:, i] + else: + word = word_prob.argmax(1) + word = paddle.multiply( + word, labels[:, i] + ) # labels are oneslike tensor in infer/predict mode + + return word_probs + + def init_hidden(self, features, feature_mask): + average = paddle.sum(paddle.sum(features * feature_mask, axis=-1), + axis=-1) / paddle.sum( + (paddle.sum(feature_mask, axis=-1)), axis=-1) + average = self.init_weight(average) + return paddle.tanh(average) + + +''' +Attention Module +''' + + +class Attention(nn.Layer): + def __init__(self, hidden_size, attention_dim): + super(Attention, self).__init__() + self.hidden = hidden_size + self.attention_dim = attention_dim + self.hidden_weight = nn.Linear(self.hidden, self.attention_dim) + 
self.attention_conv = nn.Conv2D( + 1, 512, kernel_size=11, padding=5, bias_attr=False) + self.attention_weight = nn.Linear( + 512, self.attention_dim, bias_attr=False) + self.alpha_convert = nn.Linear(self.attention_dim, 1) + + def forward(self, + cnn_features, + cnn_features_trans, + hidden, + alpha_sum, + image_mask=None): + query = self.hidden_weight(hidden) + alpha_sum_trans = self.attention_conv(alpha_sum) + coverage_alpha = self.attention_weight( + paddle.transpose(alpha_sum_trans, [0, 2, 3, 1])) + alpha_score = paddle.tanh( + paddle.unsqueeze(query, [1, 2]) + coverage_alpha + paddle.transpose( + cnn_features_trans, [0, 2, 3, 1])) + energy = self.alpha_convert(alpha_score) + energy = energy - energy.max() + energy_exp = paddle.exp(paddle.squeeze(energy, -1)) + + if image_mask is not None: + energy_exp = energy_exp * paddle.squeeze(image_mask, 1) + alpha = energy_exp / (paddle.unsqueeze( + paddle.sum(paddle.sum(energy_exp, -1), -1), [1, 2]) + 1e-10) + alpha_sum = paddle.unsqueeze(alpha, 1) + alpha_sum + context_vector = paddle.sum( + paddle.sum((paddle.unsqueeze(alpha, 1) * cnn_features), -1), -1) + + return context_vector, alpha, alpha_sum + + +class CANHead(nn.Layer): + def __init__(self, in_channel, out_channel, ratio, attdecoder, **kwargs): + super(CANHead, self).__init__() + + self.in_channel = in_channel + self.out_channel = out_channel + + self.counting_decoder1 = CountingDecoder(self.in_channel, + self.out_channel, 3) # mscm + self.counting_decoder2 = CountingDecoder(self.in_channel, + self.out_channel, 5) + + self.decoder = AttDecoder(ratio, **attdecoder) + + self.ratio = ratio + + def forward(self, inputs, targets=None): + cnn_features, images_mask, labels = inputs + + counting_mask = images_mask[:, :, ::self.ratio, ::self.ratio] + counting_preds1, _ = self.counting_decoder1(cnn_features, counting_mask) + counting_preds2, _ = self.counting_decoder2(cnn_features, counting_mask) + counting_preds = (counting_preds1 + counting_preds2) / 2 + + word_probs = self.decoder(cnn_features, labels, counting_preds, + images_mask) + return word_probs, counting_preds, counting_preds1, counting_preds2 diff --git a/ppocr/modeling/heads/rec_ctc_head.py b/ppocr/modeling/heads/rec_ctc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6c1cf0659607186d54dfee6983b135f34d542446 --- /dev/null +++ b/ppocr/modeling/heads/rec_ctc_head.py @@ -0,0 +1,87 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
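The counting branch above (CountingDecoder) predicts, for each symbol class, a sigmoid map over the downsampled feature grid; summing that map over all spatial positions gives an estimate of how many times each symbol occurs, and CANHead averages the estimates from its 3x3 and 5x5 branches before handing them to the attention decoder. A toy illustration of that spatial reduction, with invented shapes:

import paddle
import paddle.nn.functional as F

b, num_classes, h, w = 2, 5, 4, 8                          # invented shapes
density = F.sigmoid(paddle.randn([b, num_classes, h, w]))  # per-class "density" map

# Summing each per-class map over the h*w grid gives the expected symbol counts.
counts = paddle.sum(paddle.reshape(density, [b, num_classes, -1]), axis=-1)
print(counts.shape)   # [2, 5] -> one estimated count per class and image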
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle.nn import functional as F + + +def get_para_bias_attr(l2_decay, k): + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +class CTCHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs): + super(CTCHead, self).__init__() + if mid_channels is None: + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels) + self.fc = nn.Linear( + in_channels, + out_channels, + weight_attr=weight_attr, + bias_attr=bias_attr) + else: + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels) + self.fc1 = nn.Linear( + in_channels, + mid_channels, + weight_attr=weight_attr1, + bias_attr=bias_attr1) + + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=mid_channels) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + weight_attr=weight_attr2, + bias_attr=bias_attr2) + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + def forward(self, x, targets=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + if not self.training: + predicts = F.softmax(predicts, axis=2) + result = predicts + + return result diff --git a/ppocr/modeling/heads/rec_multi_head.py b/ppocr/modeling/heads/rec_multi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2f10e7bdf90025d3304128e720ce561c8bb269c1 --- /dev/null +++ b/ppocr/modeling/heads/rec_multi_head.py @@ -0,0 +1,73 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
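CTCHead above only emits a per-timestep distribution over the character set (the softmax is applied in eval mode); collapsing that into a string is the job of the CTC label decoder, which in the usual greedy scheme takes the argmax at each step, merges consecutive repeats, and drops the blank. A rough sketch of that post-processing, assuming blank index 0 and a made-up four-character alphabet:

import paddle

alphabet = ['<blank>', 'a', 'b', 'c', 'd']      # assumption: index 0 is the CTC blank

# Stand-in for CTCHead output for a single image: T=7 steps over len(alphabet) classes.
probs = paddle.rand([7, len(alphabet)])
best_path = paddle.argmax(probs, axis=1).numpy().tolist()

decoded = []
prev = None
for idx in best_path:
    if idx != prev and idx != 0:                # merge repeats, then drop blanks
        decoded.append(alphabet[idx])
    prev = idx
print(''.join(decoded))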
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +from ppocr.modeling.necks.rnn import Im2Seq, EncoderWithRNN, EncoderWithFC, SequenceEncoder, EncoderWithSVTR +from .rec_ctc_head import CTCHead +from .rec_sar_head import SARHead + + +class MultiHead(nn.Layer): + def __init__(self, in_channels, out_channels_list, **kwargs): + super().__init__() + self.head_list = kwargs.pop('head_list') + self.gtc_head = 'sar' + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == 'SARHead': + # sar head + sar_args = self.head_list[idx][name] + self.sar_head = eval(name)(in_channels=in_channels, \ + out_channels=out_channels_list['SARLabelDecode'], **sar_args) + elif name == 'CTCHead': + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]['Neck'] + encoder_type = neck_args.pop('name') + self.encoder = encoder_type + self.ctc_encoder = SequenceEncoder(in_channels=in_channels, \ + encoder_type=encoder_type, **neck_args) + # ctc head + head_args = self.head_list[idx][name]['Head'] + self.ctc_head = eval(name)(in_channels=self.ctc_encoder.out_channels, \ + out_channels=out_channels_list['CTCLabelDecode'], **head_args) + else: + raise NotImplementedError( + '{} is not supported in MultiHead yet'.format(name)) + + def forward(self, x, targets=None): + ctc_encoder = self.ctc_encoder(x) + ctc_out = self.ctc_head(ctc_encoder, targets) + head_out = dict() + head_out['ctc'] = ctc_out + head_out['ctc_neck'] = ctc_encoder + # eval mode + if not self.training: + return ctc_out + if self.gtc_head == 'sar': + sar_out = self.sar_head(x, targets[1:]) + head_out['sar'] = sar_out + return head_out + else: + return head_out diff --git a/ppocr/modeling/heads/rec_nrtr_head.py b/ppocr/modeling/heads/rec_nrtr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2fffa521769f66476b1b8d54219fe401f34cd5ad --- /dev/null +++ b/ppocr/modeling/heads/rec_nrtr_head.py @@ -0,0 +1,672 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle.nn import LayerList +from paddle.nn import Dropout, Linear, LayerNorm +import numpy as np +from ppocr.modeling.backbones.rec_svtrnet import Mlp, zeros_, ones_ +from paddle.nn.initializer import XavierNormal as xavier_normal_ + + +class Transformer(nn.Layer): + """A transformer model. User is able to modify the attributes as needed. The architechture + is based on the paper "Attention Is All You Need". Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. 
+ Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + beam_size=0, + num_decoder_layers=6, + max_len=25, + dim_feedforward=1024, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + in_channels=0, + out_channels=0, + scale_embedding=True): + super(Transformer, self).__init__() + self.out_channels = out_channels + 1 + self.max_len = max_len + self.embedding = Embeddings( + d_model=d_model, + vocab=self.out_channels, + padding_idx=0, + scale_embedding=scale_embedding) + self.positional_encoding = PositionalEncoding( + dropout=residual_dropout_rate, dim=d_model) + + if num_encoder_layers > 0: + self.encoder = nn.LayerList([ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=False) for i in range(num_encoder_layers) + ]) + else: + self.encoder = None + + self.decoder = nn.LayerList([ + TransformerBlock( + d_model, + nhead, + dim_feedforward, + attention_dropout_rate, + residual_dropout_rate, + with_self_attn=True, + with_cross_attn=True) for i in range(num_decoder_layers) + ]) + + self.beam_size = beam_size + self.d_model = d_model + self.nhead = nhead + self.tgt_word_prj = nn.Linear( + d_model, self.out_channels, bias_attr=False) + w0 = np.random.normal(0.0, d_model**-0.5, + (d_model, self.out_channels)).astype(np.float32) + self.tgt_word_prj.weight.set_value(w0) + self.apply(self._init_weights) + + def _init_weights(self, m): + + if isinstance(m, nn.Linear): + xavier_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward_train(self, src, tgt): + tgt = tgt[:, :-1] + + tgt = self.embedding(tgt) + tgt = self.positional_encoding(tgt) + tgt_mask = self.generate_square_subsequent_mask(tgt.shape[1]) + + if self.encoder is not None: + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C + else: + memory = src # B N C + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + output = tgt + logit = self.tgt_word_prj(output) + return logit + + def forward(self, src, targets=None): + """Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + Shape: + - src: :math:`(B, sN, C)`. + - tgt: :math:`(B, tN, C)`. 
+ Examples: + >>> output = transformer_model(src, tgt) + """ + + if self.training: + max_len = targets[1].max() + tgt = targets[0][:, :2 + max_len] + return self.forward_train(src, tgt) + else: + if self.beam_size > 0: + return self.forward_beam(src) + else: + return self.forward_test(src) + + def forward_test(self, src): + + bs = paddle.shape(src)[0] + if self.encoder is not None: + src = self.positional_encoding(src) + for encoder_layer in self.encoder: + src = encoder_layer(src) + memory = src # B N C + else: + memory = src + dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64) + dec_prob = paddle.full((bs, 1), 1., dtype=paddle.float32) + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): + dec_seq_embed = self.embedding(dec_seq) + dec_seq_embed = self.positional_encoding(dec_seq_embed) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq_embed)[1]) + tgt = dec_seq_embed + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, memory, self_mask=tgt_mask) + dec_output = tgt + dec_output = dec_output[:, -1, :] + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=-1) + preds_idx = paddle.argmax(word_prob, axis=-1) + if paddle.equal_all( + preds_idx, + paddle.full( + paddle.shape(preds_idx), 3, dtype='int64')): + break + preds_prob = paddle.max(word_prob, axis=-1) + dec_seq = paddle.concat( + [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1) + dec_prob = paddle.concat( + [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1) + return [dec_seq, dec_prob] + + def forward_beam(self, images): + """ Translation work in one batch """ + + def get_inst_idx_to_tensor_position_map(inst_idx_list): + """ Indicate the position of an instance in a tensor. """ + return { + inst_idx: tensor_position + for tensor_position, inst_idx in enumerate(inst_idx_list) + } + + def collect_active_part(beamed_tensor, curr_active_inst_idx, + n_prev_active_inst, n_bm): + """ Collect tensor parts associated to active instances. """ + + beamed_tensor_shape = paddle.shape(beamed_tensor) + n_curr_active_inst = len(curr_active_inst_idx) + new_shape = (n_curr_active_inst * n_bm, beamed_tensor_shape[1], + beamed_tensor_shape[2]) + + beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1]) + beamed_tensor = beamed_tensor.index_select( + curr_active_inst_idx, axis=0) + beamed_tensor = beamed_tensor.reshape(new_shape) + + return beamed_tensor + + def collate_active_info(src_enc, inst_idx_to_position_map, + active_inst_idx_list): + # Sentences which are still active are collected, + # so the decoder will not run on completed sentences. 
+ + n_prev_active_inst = len(inst_idx_to_position_map) + active_inst_idx = [ + inst_idx_to_position_map[k] for k in active_inst_idx_list + ] + active_inst_idx = paddle.to_tensor(active_inst_idx, dtype='int64') + active_src_enc = collect_active_part( + src_enc.transpose([1, 0, 2]), active_inst_idx, + n_prev_active_inst, n_bm).transpose([1, 0, 2]) + active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + return active_src_enc, active_inst_idx_to_position_map + + def beam_decode_step(inst_dec_beams, len_dec_seq, enc_output, + inst_idx_to_position_map, n_bm): + """ Decode and update beam status, and then return active beam idx """ + + def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): + dec_partial_seq = [ + b.get_current_state() for b in inst_dec_beams if not b.done + ] + dec_partial_seq = paddle.stack(dec_partial_seq) + dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq]) + return dec_partial_seq + + def predict_word(dec_seq, enc_output, n_active_inst, n_bm): + dec_seq = self.embedding(dec_seq) + dec_seq = self.positional_encoding(dec_seq) + tgt_mask = self.generate_square_subsequent_mask( + paddle.shape(dec_seq)[1]) + tgt = dec_seq + for decoder_layer in self.decoder: + tgt = decoder_layer(tgt, enc_output, self_mask=tgt_mask) + dec_output = tgt + dec_output = dec_output[:, + -1, :] # Pick the last step: (bh * bm) * d_h + word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1) + word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1]) + return word_prob + + def collect_active_inst_idx_list(inst_beams, word_prob, + inst_idx_to_position_map): + active_inst_idx_list = [] + for inst_idx, inst_position in inst_idx_to_position_map.items(): + is_inst_complete = inst_beams[inst_idx].advance(word_prob[ + inst_position]) + if not is_inst_complete: + active_inst_idx_list += [inst_idx] + + return active_inst_idx_list + + n_active_inst = len(inst_idx_to_position_map) + dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) + word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm) + # Update the beam with predicted word prob information and collect incomplete instances + active_inst_idx_list = collect_active_inst_idx_list( + inst_dec_beams, word_prob, inst_idx_to_position_map) + return active_inst_idx_list + + def collect_hypothesis_and_scores(inst_dec_beams, n_best): + all_hyp, all_scores = [], [] + for inst_idx in range(len(inst_dec_beams)): + scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() + all_scores += [scores[:n_best]] + hyps = [ + inst_dec_beams[inst_idx].get_hypothesis(i) + for i in tail_idxs[:n_best] + ] + all_hyp += [hyps] + return all_hyp, all_scores + + with paddle.no_grad(): + #-- Encode + if self.encoder is not None: + src = self.positional_encoding(images) + src_enc = self.encoder(src) + else: + src_enc = images + + n_bm = self.beam_size + src_shape = paddle.shape(src_enc) + inst_dec_beams = [Beam(n_bm) for _ in range(1)] + active_inst_idx_list = list(range(1)) + # Repeat data for beam search + src_enc = paddle.tile(src_enc, [1, n_bm, 1]) + inst_idx_to_position_map = get_inst_idx_to_tensor_position_map( + active_inst_idx_list) + # Decode + for len_dec_seq in range(1, paddle.to_tensor(self.max_len)): + src_enc_copy = src_enc.clone() + active_inst_idx_list = beam_decode_step( + inst_dec_beams, len_dec_seq, src_enc_copy, + inst_idx_to_position_map, n_bm) + if not active_inst_idx_list: + break # all instances have finished their path to + src_enc, inst_idx_to_position_map = collate_active_info( + 
src_enc_copy, inst_idx_to_position_map, + active_inst_idx_list) + batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, + 1) + result_hyp = [] + hyp_scores = [] + for bs_hyp, score in zip(batch_hyp, batch_scores): + l = len(bs_hyp[0]) + bs_hyp_pad = bs_hyp[0] + [3] * (25 - l) + result_hyp.append(bs_hyp_pad) + score = float(score) / l + hyp_score = [score for _ in range(25)] + hyp_scores.append(hyp_score) + return [ + paddle.to_tensor( + np.array(result_hyp), dtype=paddle.int64), + paddle.to_tensor(hyp_scores) + ] + + def generate_square_subsequent_mask(self, sz): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + mask = paddle.zeros([sz, sz], dtype='float32') + mask_inf = paddle.triu( + paddle.full( + shape=[sz, sz], dtype='float32', fill_value='-inf'), + diagonal=1) + mask = mask + mask_inf + return mask.unsqueeze([0, 1]) + + +class MultiheadAttention(nn.Layer): + """Allows the model to jointly attend to information + from different representation subspaces. + See reference: Attention Is All You Need + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) + Args: + embed_dim: total dimension of the model + num_heads: parallel attention layers, or heads + """ + + def __init__(self, embed_dim, num_heads, dropout=0., self_attn=False): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + # self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scale = self.head_dim**-0.5 + self.self_attn = self_attn + if self_attn: + self.qkv = nn.Linear(embed_dim, embed_dim * 3) + else: + self.q = nn.Linear(embed_dim, embed_dim) + self.kv = nn.Linear(embed_dim, embed_dim * 2) + self.attn_drop = nn.Dropout(dropout) + self.out_proj = nn.Linear(embed_dim, embed_dim) + + def forward(self, query, key=None, attn_mask=None): + + qN = query.shape[1] + + if self.self_attn: + qkv = self.qkv(query).reshape( + (0, qN, 3, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + else: + kN = key.shape[1] + q = self.q(query).reshape( + [0, qN, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + kv = self.kv(key).reshape( + (0, kN, 2, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + k, v = kv[0], kv[1] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + + if attn_mask is not None: + attn += attn_mask + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape( + (0, qN, self.embed_dim)) + x = self.out_proj(x) + + return x + + +class TransformerBlock(nn.Layer): + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + attention_dropout_rate=0.0, + residual_dropout_rate=0.1, + with_self_attn=True, + with_cross_attn=False, + epsilon=1e-5): + super(TransformerBlock, self).__init__() + self.with_self_attn = with_self_attn + if with_self_attn: + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attention_dropout_rate, + self_attn=with_self_attn) + self.norm1 = LayerNorm(d_model, epsilon=epsilon) + self.dropout1 = Dropout(residual_dropout_rate) + self.with_cross_attn = with_cross_attn + if with_cross_attn: + self.cross_attn = MultiheadAttention( #for self_attn of encoder or cross_attn of decoder + 
d_model, + nhead, + dropout=attention_dropout_rate) + self.norm2 = LayerNorm(d_model, epsilon=epsilon) + self.dropout2 = Dropout(residual_dropout_rate) + + self.mlp = Mlp(in_features=d_model, + hidden_features=dim_feedforward, + act_layer=nn.ReLU, + drop=residual_dropout_rate) + + self.norm3 = LayerNorm(d_model, epsilon=epsilon) + + self.dropout3 = Dropout(residual_dropout_rate) + + def forward(self, tgt, memory=None, self_mask=None, cross_mask=None): + if self.with_self_attn: + tgt1 = self.self_attn(tgt, attn_mask=self_mask) + tgt = self.norm1(tgt + self.dropout1(tgt1)) + + if self.with_cross_attn: + tgt2 = self.cross_attn(tgt, key=memory, attn_mask=cross_mask) + tgt = self.norm2(tgt + self.dropout2(tgt2)) + tgt = self.norm3(tgt + self.dropout3(self.mlp(tgt))) + return tgt + + +class PositionalEncoding(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). + Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + pe = paddle.transpose(pe, [1, 0, 2]) + self.register_buffer('pe', pe) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + x = x.transpose([1, 0, 2]) + x = x + self.pe[:paddle.shape(x)[0], :] + return self.dropout(x).transpose([1, 0, 2]) + + +class PositionalEncoding_2d(nn.Layer): + """Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding_2d, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2]) + self.register_buffer('pe', pe) + + self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear1 = nn.Linear(dim, dim) + self.linear1.weight.data.fill_(1.) + self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1)) + self.linear2 = nn.Linear(dim, dim) + self.linear2.weight.data.fill_(1.) + + def forward(self, x): + """Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + w_pe = self.pe[:paddle.shape(x)[-1], :] + w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0) + w_pe = w_pe * w1 + w_pe = paddle.transpose(w_pe, [1, 2, 0]) + w_pe = paddle.unsqueeze(w_pe, 2) + + h_pe = self.pe[:paddle.shape(x).shape[-2], :] + w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0) + h_pe = h_pe * w2 + h_pe = paddle.transpose(h_pe, [1, 2, 0]) + h_pe = paddle.unsqueeze(h_pe, 3) + + x = x + w_pe + h_pe + x = paddle.transpose( + paddle.reshape(x, + [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]), + [2, 0, 1]) + + return self.dropout(x) + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab, padding_idx=None, scale_embedding=True): + super(Embeddings, self).__init__() + self.embedding = nn.Embedding(vocab, d_model, padding_idx=padding_idx) + w0 = np.random.normal(0.0, d_model**-0.5, + (vocab, d_model)).astype(np.float32) + self.embedding.weight.set_value(w0) + self.d_model = d_model + self.scale_embedding = scale_embedding + + def forward(self, x): + if self.scale_embedding: + x = self.embedding(x) + return x * math.sqrt(self.d_model) + return self.embedding(x) + + +class Beam(): + """ Beam search """ + + def __init__(self, size, device=False): + + self.size = size + self._done = False + # The score for each translation on the beam. + self.scores = paddle.zeros((size, ), dtype=paddle.float32) + self.all_scores = [] + # The backpointers at each time-step. + self.prev_ks = [] + # The outputs at each time-step. + self.next_ys = [paddle.full((size, ), 0, dtype=paddle.int64)] + self.next_ys[0][0] = 2 + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.get_tentative_hypothesis() + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + @property + def done(self): + return self._done + + def advance(self, word_prob): + "Update beam status and check if finished or not." + num_words = word_prob.shape[1] + + # Sum the previous scores. 
+ if len(self.prev_ks) > 0: + beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) + else: + beam_lk = word_prob[0] + + flat_beam_lk = beam_lk.reshape([-1]) + best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, + True) # 1st sort + self.all_scores.append(self.scores) + self.scores = best_scores + # bestScoresId is flattened as a (beam x word) array, + # so we need to calculate which word and beam each score came from + prev_k = best_scores_id // num_words + self.prev_ks.append(prev_k) + self.next_ys.append(best_scores_id - prev_k * num_words) + # End condition is when top-of-beam is EOS. + if self.next_ys[-1][0] == 3: + self._done = True + self.all_scores.append(self.scores) + + return self._done + + def sort_scores(self): + "Sort the scores." + return self.scores, paddle.to_tensor( + [i for i in range(int(self.scores.shape[0]))], dtype='int32') + + def get_the_best_score_and_idx(self): + "Get the score of the best in the beam." + scores, ids = self.sort_scores() + return scores[1], ids[1] + + def get_tentative_hypothesis(self): + "Get the decoded sequence for the current timestep." + if len(self.next_ys) == 1: + dec_seq = self.next_ys[0].unsqueeze(1) + else: + _, keys = self.sort_scores() + hyps = [self.get_hypothesis(k) for k in keys] + hyps = [[2] + h for h in hyps] + dec_seq = paddle.to_tensor(hyps, dtype='int64') + return dec_seq + + def get_hypothesis(self, k): + """ Walk back to construct the full hypothesis. """ + hyp = [] + for j in range(len(self.prev_ks) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + k = self.prev_ks[j][k] + return list(map(lambda x: x.item(), hyp[::-1])) diff --git a/ppocr/modeling/heads/rec_pren_head.py b/ppocr/modeling/heads/rec_pren_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c9e4b3e9f82f60b7671e56ddfa02baff5e62f37b --- /dev/null +++ b/ppocr/modeling/heads/rec_pren_head.py @@ -0,0 +1,34 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from paddle.nn import functional as F + + +class PRENHead(nn.Layer): + def __init__(self, in_channels, out_channels, **kwargs): + super(PRENHead, self).__init__() + self.linear = nn.Linear(in_channels, out_channels) + + def forward(self, x, targets=None): + predicts = self.linear(x) + + if not self.training: + predicts = F.softmax(predicts, axis=2) + + return predicts diff --git a/ppocr/modeling/heads/rec_rfl_head.py b/ppocr/modeling/heads/rec_rfl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1ded8cde939f83f383f049d22d1b4750b7d15eb4 --- /dev/null +++ b/ppocr/modeling/heads/rec_rfl_head.py @@ -0,0 +1,108 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py +""" +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +from .rec_att_head import AttentionLSTM + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class CNTHead(nn.Layer): + def __init__(self, + embed_size=512, + encode_length=26, + out_channels=38, + **kwargs): + super(CNTHead, self).__init__() + + self.out_channels = out_channels + + self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False) + self.Prediction_visual = nn.Linear(encode_length * embed_size, + self.out_channels) + + def forward(self, visual_feature): + + b, c, h, w = visual_feature.shape + visual_feature = visual_feature.reshape([b, c, h * w]).transpose( + [0, 2, 1]) + visual_feature_num = self.Wv_fusion(visual_feature) # batch * 26 * 512 + b, n, c = visual_feature_num.shape + # using visual feature directly calculate the text length + visual_feature_num = visual_feature_num.reshape([b, n * c]) + prediction_visual = self.Prediction_visual(visual_feature_num) + + return prediction_visual + + +class RFLHead(nn.Layer): + def __init__(self, + in_channels=512, + hidden_size=256, + batch_max_legnth=25, + out_channels=38, + use_cnt=True, + use_seq=True, + **kwargs): + + super(RFLHead, self).__init__() + assert use_cnt or use_seq + self.use_cnt = use_cnt + self.use_seq = use_seq + if self.use_cnt: + self.cnt_head = CNTHead( + embed_size=in_channels, + encode_length=batch_max_legnth + 1, + out_channels=out_channels, + **kwargs) + if self.use_seq: + self.seq_head = AttentionLSTM( + in_channels=in_channels, + out_channels=out_channels, + hidden_size=hidden_size, + **kwargs) + self.batch_max_legnth = batch_max_legnth + self.num_class = out_channels + self.apply(self.init_weights) + + def init_weights(self, m): + if isinstance(m, nn.Linear): + kaiming_init_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + + def forward(self, x, targets=None): + cnt_inputs, seq_inputs = x + if self.use_cnt: + cnt_outputs = self.cnt_head(cnt_inputs) + else: + cnt_outputs = None + if self.use_seq: + if self.training: + seq_outputs = self.seq_head(seq_inputs, targets[0], + self.batch_max_legnth) + else: + seq_outputs = self.seq_head(seq_inputs, None, + self.batch_max_legnth) + return cnt_outputs, seq_outputs + else: + return cnt_outputs diff --git a/ppocr/modeling/heads/rec_robustscanner_head.py b/ppocr/modeling/heads/rec_robustscanner_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7956059ecfe01f27db364d3d748d6af24dad0aac --- /dev/null +++ b/ppocr/modeling/heads/rec_robustscanner_head.py @@ -0,0 +1,709 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/channel_reduction_encoder.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/robust_scanner_decoder.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + valid_ratios=None, + word_positions=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, valid_ratios, word_positions) + return self.forward_test(feat, out_enc, valid_ratios, word_positions) + +class ChannelReductionEncoder(nn.Layer): + """Change the channel number with a one by one convoluational layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + """ + + def __init__(self, + in_channels, + out_channels, + **kwargs): + super(ChannelReductionEncoder, self).__init__() + + self.layer = nn.Conv2D( + in_channels, out_channels, kernel_size=1, stride=1, padding=0, weight_attr=nn.initializer.XavierNormal()) + + def forward(self, feat): + """ + Args: + feat (Tensor): Image features with the shape of + :math:`(N, C_{in}, H, W)`. + + Returns: + Tensor: A tensor of shape :math:`(N, C_{out}, H, W)`. + """ + return self.layer(feat) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + +class DotProductAttentionLayer(nn.Layer): + + def __init__(self, dim_model=None): + super().__init__() + + self.scale = dim_model**-0.5 if dim_model is not None else 1. + + def forward(self, query, key, value, h, w, valid_ratios=None): + query = paddle.transpose(query, (0, 2, 1)) + logits = paddle.matmul(query, key) * self.scale + n, c, t = logits.shape + # reshape to (n, c, h, w) + logits = paddle.reshape(logits, [n, c, h, w]) + if valid_ratios is not None: + # cal mask of attention weight + for i, valid_ratio in enumerate(valid_ratios): + valid_width = min(w, int(w * valid_ratio + 0.5)) + if valid_width < w: + logits[i, :, :, valid_width:] = float('-inf') + + # reshape to (n, c, h, w) + logits = paddle.reshape(logits, [n, c, t]) + weights = F.softmax(logits, axis=2) + value = paddle.transpose(value, (0, 2, 1)) + glimpse = paddle.matmul(weights, value) + glimpse = paddle.transpose(glimpse, (0, 2, 1)) + return glimpse + +class SequenceAttentionDecoder(BaseDecoder): + """Sequence attention decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + rnn_layers (int): Number of RNN layers. 
+ dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + start_idx (int): The index of ``. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + padding_idx (int): The index of ``. + dropout (float): Dropout rate. + return_feature (bool): Return feature or logits as the result. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. + + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. `` + is also ignored by loss as specified in + :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`. + """ + + def __init__(self, + num_classes=None, + rnn_layers=2, + dim_input=512, + dim_model=128, + max_seq_len=40, + start_idx=0, + mask=True, + padding_idx=None, + dropout=0, + return_feature=False, + encode_value=False): + super().__init__() + + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.return_feature = return_feature + self.encode_value = encode_value + self.max_seq_len = max_seq_len + self.start_idx = start_idx + self.mask = mask + + self.embedding = nn.Embedding( + self.num_classes, self.dim_model, padding_idx=padding_idx) + + self.sequence_layer = nn.LSTM( + input_size=dim_model, + hidden_size=dim_model, + num_layers=rnn_layers, + time_major=False, + dropout=dropout) + + self.attention_layer = DotProductAttentionLayer() + + self.prediction = None + if not self.return_feature: + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear( + dim_model if encode_value else dim_input, pred_num_classes) + + def forward_train(self, feat, out_enc, targets, valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + targets (Tensor): a tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): valid length ratio of img. + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it would be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. + """ + + tgt_embedding = self.embedding(targets) + + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, len_q, c_q = tgt_embedding.shape + assert c_q == self.dim_model + assert len_q <= self.max_seq_len + + query, _ = self.sequence_layer(tgt_embedding) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(out_enc, [n, c_enc, h * w]) + if self.encode_value: + value = key + else: + value = paddle.reshape(feat, [n, c_feat, h * w]) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) + + if self.return_feature: + return attn_out + + out = self.prediction(attn_out) + + return out + + def forward_test(self, feat, out_enc, valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): valid length ratio of img. 
+ + Returns: + Tensor: The output logit sequence tensor of shape + :math:`(N, T, C-1)`. + """ + seq_len = self.max_seq_len + batch_size = feat.shape[0] + + decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx) + + outputs = [] + for i in range(seq_len): + step_out = self.forward_test_step(feat, out_enc, decode_sequence, + i, valid_ratios) + outputs.append(step_out) + max_idx = paddle.argmax(step_out, axis=1, keepdim=False) + if i < seq_len - 1: + decode_sequence[:, i + 1] = max_idx + + outputs = paddle.stack(outputs, 1) + + return outputs + + def forward_test_step(self, feat, out_enc, decode_sequence, current_step, + valid_ratios): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + decode_sequence (Tensor): Shape :math:`(N, T)`. The tensor that + stores history decoding result. + current_step (int): Current decoding step. + valid_ratios (Tensor): valid length ratio of img + + Returns: + Tensor: Shape :math:`(N, C-1)`. The logit tensor of predicted + tokens at current time step. + """ + + embed = self.embedding(decode_sequence) + + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, _, c_q = embed.shape + assert c_q == self.dim_model + + query, _ = self.sequence_layer(embed) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(out_enc, [n, c_enc, h * w]) + if self.encode_value: + value = key + else: + value = paddle.reshape(feat, [n, c_feat, h * w]) + + # [n, c, l] + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + out = attn_out[:, :, current_step] + + if self.return_feature: + return out + + out = self.prediction(out) + out = F.softmax(out, dim=-1) + + return out + + +class PositionAwareLayer(nn.Layer): + + def __init__(self, dim_model, rnn_layers=2): + super().__init__() + + self.dim_model = dim_model + + self.rnn = nn.LSTM( + input_size=dim_model, + hidden_size=dim_model, + num_layers=rnn_layers, + time_major=False) + + self.mixer = nn.Sequential( + nn.Conv2D( + dim_model, dim_model, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2D( + dim_model, dim_model, kernel_size=3, stride=1, padding=1)) + + def forward(self, img_feature): + n, c, h, w = img_feature.shape + rnn_input = paddle.transpose(img_feature, (0, 2, 3, 1)) + rnn_input = paddle.reshape(rnn_input, (n * h, w, c)) + rnn_output, _ = self.rnn(rnn_input) + rnn_output = paddle.reshape(rnn_output, (n, h, w, c)) + rnn_output = paddle.transpose(rnn_output, (0, 3, 1, 2)) + out = self.mixer(rnn_output) + return out + + +class PositionAttentionDecoder(BaseDecoder): + """Position attention decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + rnn_layers (int): Number of RNN layers. + dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + return_feature (bool): Return feature or logits as the result. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. 
+ + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. `` + is also ignored by loss + + """ + + def __init__(self, + num_classes=None, + rnn_layers=2, + dim_input=512, + dim_model=128, + max_seq_len=40, + mask=True, + return_feature=False, + encode_value=False): + super().__init__() + + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.max_seq_len = max_seq_len + self.return_feature = return_feature + self.encode_value = encode_value + self.mask = mask + + self.embedding = nn.Embedding(self.max_seq_len + 1, self.dim_model) + + self.position_aware_module = PositionAwareLayer( + self.dim_model, rnn_layers) + + self.attention_layer = DotProductAttentionLayer() + + self.prediction = None + if not self.return_feature: + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear( + dim_model if encode_value else dim_input, pred_num_classes) + + def _get_position_index(self, length, batch_size): + position_index_list = [] + for i in range(batch_size): + position_index = paddle.arange(0, end=length, step=1, dtype='int64') + position_index_list.append(position_index) + batch_position_index = paddle.stack(position_index_list, axis=0) + return batch_position_index + + def forward_train(self, feat, out_enc, targets, valid_ratios, position_index): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + targets (dict): A dict with the key ``padded_targets``, a + tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): valid length ratio of img. + position_index (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it will be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. + """ + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + _, len_q = targets.shape + assert len_q <= self.max_seq_len + + position_out_enc = self.position_aware_module(out_enc) + + query = self.embedding(position_index) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(position_out_enc, (n, c_enc, h * w)) + if self.encode_value: + value = paddle.reshape(out_enc,(n, c_enc, h * w)) + else: + value = paddle.reshape(feat,(n, c_feat, h * w)) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v] + + if self.return_feature: + return attn_out + + return self.prediction(attn_out) + + def forward_test(self, feat, out_enc, valid_ratios, position_index): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): valid length ratio of img + position_index (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)` if + ``return_feature=False``. Otherwise it would be the hidden feature + before the prediction projection layer, whose shape is + :math:`(N, T, D_m)`. 
+ """ + n, c_enc, h, w = out_enc.shape + assert c_enc == self.dim_model + _, c_feat, _, _ = feat.shape + assert c_feat == self.dim_input + + position_out_enc = self.position_aware_module(out_enc) + + query = self.embedding(position_index) + query = paddle.transpose(query, (0, 2, 1)) + key = paddle.reshape(position_out_enc, (n, c_enc, h * w)) + if self.encode_value: + value = paddle.reshape(out_enc,(n, c_enc, h * w)) + else: + value = paddle.reshape(feat,(n, c_feat, h * w)) + + attn_out = self.attention_layer(query, key, value, h, w, valid_ratios) + attn_out = paddle.transpose(attn_out, (0, 2, 1)) # [n, len_q, dim_v] + + if self.return_feature: + return attn_out + + return self.prediction(attn_out) + +class RobustScannerFusionLayer(nn.Layer): + + def __init__(self, dim_model, dim=-1): + super(RobustScannerFusionLayer, self).__init__() + + self.dim_model = dim_model + self.dim = dim + self.linear_layer = nn.Linear(dim_model * 2, dim_model * 2) + + def forward(self, x0, x1): + assert x0.shape == x1.shape + fusion_input = paddle.concat([x0, x1], self.dim) + output = self.linear_layer(fusion_input) + output = F.glu(output, self.dim) + return output + +class RobustScannerDecoder(BaseDecoder): + """Decoder for RobustScanner. + + RobustScanner: `RobustScanner: Dynamically Enhancing Positional Clues for + Robust Text Recognition `_ + + Args: + num_classes (int): Number of output classes :math:`C`. + dim_input (int): Dimension :math:`D_i` of input vector ``feat``. + dim_model (int): Dimension :math:`D_m` of the model. Should also be the + same as encoder output vector ``out_enc``. + max_seq_len (int): Maximum output sequence length :math:`T`. + start_idx (int): The index of ``. + mask (bool): Whether to mask input features according to + ``img_meta['valid_ratio']``. + padding_idx (int): The index of ``. + encode_value (bool): Whether to use the output of encoder ``out_enc`` + as `value` of attention layer. If False, the original feature + ``feat`` will be used. + + Warning: + This decoder will not predict the final class which is assumed to be + ``. Therefore, its output size is always :math:`C - 1`. `` + is also ignored by loss as specified in + :obj:`mmocr.models.textrecog.recognizer.EncodeDecodeRecognizer`. 
+ """ + + def __init__(self, + num_classes=None, + dim_input=512, + dim_model=128, + hybrid_decoder_rnn_layers=2, + hybrid_decoder_dropout=0, + position_decoder_rnn_layers=2, + max_seq_len=40, + start_idx=0, + mask=True, + padding_idx=None, + encode_value=False): + super().__init__() + self.num_classes = num_classes + self.dim_input = dim_input + self.dim_model = dim_model + self.max_seq_len = max_seq_len + self.encode_value = encode_value + self.start_idx = start_idx + self.padding_idx = padding_idx + self.mask = mask + + # init hybrid decoder + self.hybrid_decoder = SequenceAttentionDecoder( + num_classes=num_classes, + rnn_layers=hybrid_decoder_rnn_layers, + dim_input=dim_input, + dim_model=dim_model, + max_seq_len=max_seq_len, + start_idx=start_idx, + mask=mask, + padding_idx=padding_idx, + dropout=hybrid_decoder_dropout, + encode_value=encode_value, + return_feature=True + ) + + # init position decoder + self.position_decoder = PositionAttentionDecoder( + num_classes=num_classes, + rnn_layers=position_decoder_rnn_layers, + dim_input=dim_input, + dim_model=dim_model, + max_seq_len=max_seq_len, + mask=mask, + encode_value=encode_value, + return_feature=True + ) + + + self.fusion_module = RobustScannerFusionLayer( + self.dim_model if encode_value else dim_input) + + pred_num_classes = num_classes - 1 + self.prediction = nn.Linear(dim_model if encode_value else dim_input, + pred_num_classes) + + def forward_train(self, feat, out_enc, target, valid_ratios, word_positions): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + target (dict): A dict with the key ``padded_targets``, a + tensor of shape :math:`(N, T)`. Each element is the index of a + character. + valid_ratios (Tensor): + word_positions (Tensor): The position of each word. + + Returns: + Tensor: A raw logit tensor of shape :math:`(N, T, C-1)`. + """ + hybrid_glimpse = self.hybrid_decoder.forward_train( + feat, out_enc, target, valid_ratios) + position_glimpse = self.position_decoder.forward_train( + feat, out_enc, target, valid_ratios, word_positions) + + fusion_out = self.fusion_module(hybrid_glimpse, position_glimpse) + + out = self.prediction(fusion_out) + + return out + + def forward_test(self, feat, out_enc, valid_ratios, word_positions): + """ + Args: + feat (Tensor): Tensor of shape :math:`(N, D_i, H, W)`. + out_enc (Tensor): Encoder output of shape + :math:`(N, D_m, H, W)`. + valid_ratios (Tensor): + word_positions (Tensor): The position of each word. + Returns: + Tensor: The output logit sequence tensor of shape + :math:`(N, T, C-1)`. 
+ """ + seq_len = self.max_seq_len + batch_size = feat.shape[0] + + decode_sequence = (paddle.ones((batch_size, seq_len), dtype='int64') * self.start_idx) + + position_glimpse = self.position_decoder.forward_test( + feat, out_enc, valid_ratios, word_positions) + + outputs = [] + for i in range(seq_len): + hybrid_glimpse_step = self.hybrid_decoder.forward_test_step( + feat, out_enc, decode_sequence, i, valid_ratios) + + fusion_out = self.fusion_module(hybrid_glimpse_step, + position_glimpse[:, i, :]) + + char_out = self.prediction(fusion_out) + char_out = F.softmax(char_out, -1) + outputs.append(char_out) + max_idx = paddle.argmax(char_out, axis=1, keepdim=False) + if i < seq_len - 1: + decode_sequence[:, i + 1] = max_idx + + outputs = paddle.stack(outputs, 1) + + return outputs + +class RobustScannerHead(nn.Layer): + def __init__(self, + out_channels, # 90 + unknown + start + padding + in_channels, + enc_outchannles=128, + hybrid_dec_rnn_layers=2, + hybrid_dec_dropout=0, + position_dec_rnn_layers=2, + start_idx=0, + max_text_length=40, + mask=True, + padding_idx=None, + encode_value=False, + **kwargs): + super(RobustScannerHead, self).__init__() + + # encoder module + self.encoder = ChannelReductionEncoder( + in_channels=in_channels, out_channels=enc_outchannles) + + # decoder module + self.decoder =RobustScannerDecoder( + num_classes=out_channels, + dim_input=in_channels, + dim_model=enc_outchannles, + hybrid_decoder_rnn_layers=hybrid_dec_rnn_layers, + hybrid_decoder_dropout=hybrid_dec_dropout, + position_decoder_rnn_layers=position_dec_rnn_layers, + max_seq_len=max_text_length, + start_idx=start_idx, + mask=mask, + padding_idx=padding_idx, + encode_value=encode_value) + + def forward(self, inputs, targets=None): + ''' + targets: [label, valid_ratio, word_positions] + ''' + out_enc = self.encoder(inputs) + valid_ratios = None + word_positions = targets[-1] + + if len(targets) > 1: + valid_ratios = targets[-2] + + if self.training: + label = targets[0] # label + label = paddle.to_tensor(label, dtype='int64') + final_out = self.decoder( + inputs, out_enc, label, valid_ratios, word_positions) + if not self.training: + final_out = self.decoder( + inputs, + out_enc, + label=None, + valid_ratios=valid_ratios, + word_positions=word_positions, + train_mode=False) + return final_out diff --git a/ppocr/modeling/heads/rec_sar_head.py b/ppocr/modeling/heads/rec_sar_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5e64cae85afafc555f2519ed6dd3f05eafff7ea2 --- /dev/null +++ b/ppocr/modeling/heads/rec_sar_head.py @@ -0,0 +1,410 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
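+# Illustrative usage sketch for RobustScannerHead (rec_robustscanner_head.py
+# above). A minimal, commented example, not part of the original file; the
+# batch size, the 512-channel 8x32 backbone feature and the 93-class
+# dictionary (90 characters + unknown + start + padding) are assumptions.
+#
+#   import paddle
+#   head = RobustScannerHead(out_channels=93, in_channels=512,
+#                            enc_outchannles=128, max_text_length=40)
+#   head.eval()
+#   feat = paddle.rand([2, 512, 8, 32])                # backbone feature: N, C, H, W
+#   valid_ratios = paddle.ones([2], dtype='float32')   # valid-width ratio per image
+#   word_positions = paddle.tile(
+#       paddle.arange(40, dtype='int64').unsqueeze(0), [2, 1])
+#   logits = head(feat, targets=[None, valid_ratios, word_positions])
+#   # logits: (N, max_text_length, out_channels - 1) per-step softmax scores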
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/encoders/sar_encoder.py +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textrecog/decoders/sar_decoder.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + + +class SAREncoder(nn.Layer): + """ + Args: + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + enc_drop_rnn (float): Dropout probability of RNN layer in encoder. + enc_gru (bool): If True, use GRU, else LSTM in encoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + mask (bool): If True, mask padding in RNN sequence. + """ + + def __init__(self, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + d_model=512, + d_enc=512, + mask=True, + **kwargs): + super().__init__() + assert isinstance(enc_bi_rnn, bool) + assert isinstance(enc_drop_rnn, (int, float)) + assert 0 <= enc_drop_rnn < 1.0 + assert isinstance(enc_gru, bool) + assert isinstance(d_model, int) + assert isinstance(d_enc, int) + assert isinstance(mask, bool) + + self.enc_bi_rnn = enc_bi_rnn + self.enc_drop_rnn = enc_drop_rnn + self.mask = mask + + # LSTM Encoder + if enc_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + kwargs = dict( + input_size=d_model, + hidden_size=d_enc, + num_layers=2, + time_major=False, + dropout=enc_drop_rnn, + direction=direction) + if enc_gru: + self.rnn_encoder = nn.GRU(**kwargs) + else: + self.rnn_encoder = nn.LSTM(**kwargs) + + # global feature transformation + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + self.linear = nn.Linear(encoder_rnn_out_size, encoder_rnn_out_size) + + def forward(self, feat, img_metas=None): + if img_metas is not None: + assert len(img_metas[0]) == paddle.shape(feat)[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + h_feat = feat.shape[2] # bsz c h w + feat_v = F.max_pool2d( + feat, kernel_size=(h_feat, 1), stride=1, padding=0) + feat_v = feat_v.squeeze(2) # bsz * C * W + feat_v = paddle.transpose(feat_v, perm=[0, 2, 1]) # bsz * W * C + holistic_feat = self.rnn_encoder(feat_v)[0] # bsz * T * C + + if valid_ratios is not None: + valid_hf = [] + T = paddle.shape(holistic_feat)[1] + for i in range(paddle.shape(valid_ratios)[0]): + valid_step = paddle.minimum( + T, paddle.ceil(valid_ratios[i] * T).astype('int32')) - 1 + valid_hf.append(holistic_feat[i, valid_step, :]) + valid_hf = paddle.stack(valid_hf, axis=0) + else: + valid_hf = holistic_feat[:, -1, :] # bsz * C + holistic_feat = self.linear(valid_hf) # bsz * C + + return holistic_feat + + +class BaseDecoder(nn.Layer): + def __init__(self, **kwargs): + super().__init__() + + def forward_train(self, feat, out_enc, targets, img_metas): + raise NotImplementedError + + def forward_test(self, feat, out_enc, img_metas): + raise NotImplementedError + + def forward(self, + feat, + out_enc, + label=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + + if train_mode: + return self.forward_train(feat, out_enc, label, img_metas) + return self.forward_test(feat, out_enc, img_metas) + + +class ParallelSARDecoder(BaseDecoder): + """ + Args: + out_channels (int): Output class number. + enc_bi_rnn (bool): If True, use bidirectional RNN in encoder. + dec_bi_rnn (bool): If True, use bidirectional RNN in decoder. 
+ dec_drop_rnn (float): Dropout of RNN layer in decoder. + dec_gru (bool): If True, use GRU, else LSTM in decoder. + d_model (int): Dim of channels from backbone. + d_enc (int): Dim of encoder RNN layer. + d_k (int): Dim of channels of attention module. + pred_dropout (float): Dropout probability of prediction layer. + max_seq_len (int): Maximum sequence length for decoding. + mask (bool): If True, mask padding in feature map. + start_idx (int): Index of start token. + padding_idx (int): Index of padding token. + pred_concat (bool): If True, concat glimpse feature from + attention with holistic feature and hidden state. + """ + + def __init__( + self, + out_channels, # 90 + unknown + start + padding + enc_bi_rnn=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_model=512, + d_enc=512, + d_k=64, + pred_dropout=0.1, + max_text_length=30, + mask=True, + pred_concat=True, + **kwargs): + super().__init__() + + self.num_classes = out_channels + self.enc_bi_rnn = enc_bi_rnn + self.d_k = d_k + self.start_idx = out_channels - 2 + self.padding_idx = out_channels - 1 + self.max_seq_len = max_text_length + self.mask = mask + self.pred_concat = pred_concat + + encoder_rnn_out_size = d_enc * (int(enc_bi_rnn) + 1) + decoder_rnn_out_size = encoder_rnn_out_size * (int(dec_bi_rnn) + 1) + + # 2D attention layer + self.conv1x1_1 = nn.Linear(decoder_rnn_out_size, d_k) + self.conv3x3_1 = nn.Conv2D( + d_model, d_k, kernel_size=3, stride=1, padding=1) + self.conv1x1_2 = nn.Linear(d_k, 1) + + # Decoder RNN layer + if dec_bi_rnn: + direction = 'bidirectional' + else: + direction = 'forward' + + kwargs = dict( + input_size=encoder_rnn_out_size, + hidden_size=encoder_rnn_out_size, + num_layers=2, + time_major=False, + dropout=dec_drop_rnn, + direction=direction) + if dec_gru: + self.rnn_decoder = nn.GRU(**kwargs) + else: + self.rnn_decoder = nn.LSTM(**kwargs) + + # Decoder input embedding + self.embedding = nn.Embedding( + self.num_classes, + encoder_rnn_out_size, + padding_idx=self.padding_idx) + + # Prediction layer + self.pred_dropout = nn.Dropout(pred_dropout) + pred_num_classes = self.num_classes - 1 + if pred_concat: + fc_in_channel = decoder_rnn_out_size + d_model + encoder_rnn_out_size + else: + fc_in_channel = d_model + self.prediction = nn.Linear(fc_in_channel, pred_num_classes) + + def _2d_attention(self, + decoder_input, + feat, + holistic_feat, + valid_ratios=None): + + y = self.rnn_decoder(decoder_input)[0] + # y: bsz * (seq_len + 1) * hidden_size + + attn_query = self.conv1x1_1(y) # bsz * (seq_len + 1) * attn_size + bsz, seq_len, attn_size = attn_query.shape + attn_query = paddle.unsqueeze(attn_query, axis=[3, 4]) + # (bsz, seq_len + 1, attn_size, 1, 1) + + attn_key = self.conv3x3_1(feat) + # bsz * attn_size * h * w + attn_key = attn_key.unsqueeze(1) + # bsz * 1 * attn_size * h * w + + attn_weight = paddle.tanh(paddle.add(attn_key, attn_query)) + + # bsz * (seq_len + 1) * attn_size * h * w + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 3, 4, 2]) + # bsz * (seq_len + 1) * h * w * attn_size + attn_weight = self.conv1x1_2(attn_weight) + # bsz * (seq_len + 1) * h * w * 1 + bsz, T, h, w, c = paddle.shape(attn_weight) + assert c == 1 + + if valid_ratios is not None: + # cal mask of attention weight + for i in range(paddle.shape(valid_ratios)[0]): + valid_width = paddle.minimum( + w, paddle.ceil(valid_ratios[i] * w).astype("int32")) + if valid_width < w: + attn_weight[i, :, :, valid_width:, :] = float('-inf') + + attn_weight = paddle.reshape(attn_weight, [bsz, T, -1]) + 
attn_weight = F.softmax(attn_weight, axis=-1) + + attn_weight = paddle.reshape(attn_weight, [bsz, T, h, w, c]) + attn_weight = paddle.transpose(attn_weight, perm=[0, 1, 4, 2, 3]) + # attn_weight: bsz * T * c * h * w + # feat: bsz * c * h * w + attn_feat = paddle.sum(paddle.multiply(feat.unsqueeze(1), attn_weight), + (3, 4), + keepdim=False) + # bsz * (seq_len + 1) * C + + # Linear transformation + if self.pred_concat: + hf_c = holistic_feat.shape[-1] + holistic_feat = paddle.expand( + holistic_feat, shape=[bsz, seq_len, hf_c]) + y = self.prediction(paddle.concat((y, attn_feat, holistic_feat), 2)) + else: + y = self.prediction(attn_feat) + # bsz * (seq_len + 1) * num_classes + if self.train_mode: + y = self.pred_dropout(y) + + return y + + def forward_train(self, feat, out_enc, label, img_metas): + ''' + img_metas: [label, valid_ratio] + ''' + if img_metas is not None: + assert paddle.shape(img_metas[0])[0] == paddle.shape(feat)[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + lab_embedding = self.embedding(label) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + in_dec = paddle.concat((out_enc, lab_embedding), axis=1) + # bsz * (seq_len + 1) * C + out_dec = self._2d_attention( + in_dec, feat, out_enc, valid_ratios=valid_ratios) + + return out_dec[:, 1:, :] # bsz * seq_len * num_classes + + def forward_test(self, feat, out_enc, img_metas): + if img_metas is not None: + assert len(img_metas[0]) == feat.shape[0] + + valid_ratios = None + if img_metas is not None and self.mask: + valid_ratios = img_metas[-1] + + seq_len = self.max_seq_len + bsz = feat.shape[0] + start_token = paddle.full( + (bsz, ), fill_value=self.start_idx, dtype='int64') + # bsz + start_token = self.embedding(start_token) + # bsz * emb_dim + emb_dim = start_token.shape[1] + start_token = start_token.unsqueeze(1) + start_token = paddle.expand(start_token, shape=[bsz, seq_len, emb_dim]) + # bsz * seq_len * emb_dim + out_enc = out_enc.unsqueeze(1) + # bsz * 1 * emb_dim + decoder_input = paddle.concat((out_enc, start_token), axis=1) + # bsz * (seq_len + 1) * emb_dim + + outputs = [] + for i in range(1, seq_len + 1): + decoder_output = self._2d_attention( + decoder_input, feat, out_enc, valid_ratios=valid_ratios) + char_output = decoder_output[:, i, :] # bsz * num_classes + char_output = F.softmax(char_output, -1) + outputs.append(char_output) + max_idx = paddle.argmax(char_output, axis=1, keepdim=False) + char_embedding = self.embedding(max_idx) # bsz * emb_dim + if i < seq_len: + decoder_input[:, i + 1, :] = char_embedding + + outputs = paddle.stack(outputs, 1) # bsz * seq_len * num_classes + + return outputs + + +class SARHead(nn.Layer): + def __init__(self, + in_channels, + out_channels, + enc_dim=512, + max_text_length=30, + enc_bi_rnn=False, + enc_drop_rnn=0.1, + enc_gru=False, + dec_bi_rnn=False, + dec_drop_rnn=0.0, + dec_gru=False, + d_k=512, + pred_dropout=0.1, + pred_concat=True, + **kwargs): + super(SARHead, self).__init__() + + # encoder module + self.encoder = SAREncoder( + enc_bi_rnn=enc_bi_rnn, + enc_drop_rnn=enc_drop_rnn, + enc_gru=enc_gru, + d_model=in_channels, + d_enc=enc_dim) + + # decoder module + self.decoder = ParallelSARDecoder( + out_channels=out_channels, + enc_bi_rnn=enc_bi_rnn, + dec_bi_rnn=dec_bi_rnn, + dec_drop_rnn=dec_drop_rnn, + dec_gru=dec_gru, + d_model=in_channels, + d_enc=enc_dim, + d_k=d_k, + pred_dropout=pred_dropout, + max_text_length=max_text_length, + pred_concat=pred_concat) + + def forward(self, 
feat, targets=None): + ''' + img_metas: [label, valid_ratio] + ''' + holistic_feat = self.encoder(feat, targets) # bsz c + + if self.training: + label = targets[0] # label + final_out = self.decoder( + feat, holistic_feat, label, img_metas=targets) + else: + final_out = self.decoder( + feat, + holistic_feat, + label=None, + img_metas=targets, + train_mode=False) + # (bsz, seq_len, num_classes) + + return final_out diff --git a/ppocr/modeling/heads/rec_spin_att_head.py b/ppocr/modeling/heads/rec_spin_att_head.py new file mode 100644 index 0000000000000000000000000000000000000000..86e35e4339d8e1006cfe43d6cf4f2f7d231082c4 --- /dev/null +++ b/ppocr/modeling/heads/rec_spin_att_head.py @@ -0,0 +1,115 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/davarocr/davar_rcg/models/sequence_heads/att_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SPINAttentionHead(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(SPINAttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = paddle.shape(inputs)[0] + num_steps = batch_max_length + 1 # +1 for [sos] at end of sentence + + hidden = (paddle.zeros((batch_size, self.hidden_size)), + paddle.zeros((batch_size, self.hidden_size))) + output_hiddens = [] + if self.training: # for train + targets = targets[0] + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + outputs = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(outputs) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + next_input = probs_step.argmax(axis=1) + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, 
axis=2) + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/ppocr/modeling/heads/rec_srn_head.py b/ppocr/modeling/heads/rec_srn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1070d8cd648eb686c0a2e66df092b7dc6de29c42 --- /dev/null +++ b/ppocr/modeling/heads/rec_srn_head.py @@ -0,0 +1,278 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
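+# Illustrative usage sketch for SPINAttentionHead (rec_spin_att_head.py above).
+# A minimal, commented example, not part of the original file; the sequence
+# length, channel size and 38-class dictionary are assumptions.
+#
+#   import paddle
+#   head = SPINAttentionHead(in_channels=512, out_channels=38, hidden_size=256)
+#   head.eval()
+#   seq_feat = paddle.rand([2, 64, 512])   # dummy (N, T, C) sequence features
+#   probs = head(seq_feat)                 # (2, 26, 38): batch_max_length + 1 steps,
+#                                          # softmax over the character classes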
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +from .self_attention import WrapEncoderForFeature +from .self_attention import WrapEncoder +from paddle.static import Program +from ppocr.modeling.backbones.rec_resnet_fpn import ResNetFPN + +from collections import OrderedDict +gradient_clip = 10 + + +class PVAM(nn.Layer): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, hidden_dims): + super(PVAM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.hidden_dims = hidden_dims + # Transformer encoder + t = 256 + c = 512 + self.wrap_encoder_for_feature = WrapEncoderForFeature( + src_vocab_size=1, + max_length=t, + n_layer=self.num_encoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + # PVAM + self.flatten0 = paddle.nn.Flatten(start_axis=0, stop_axis=1) + self.fc0 = paddle.nn.Linear( + in_features=in_channels, + out_features=in_channels, ) + self.emb = paddle.nn.Embedding( + num_embeddings=self.max_length, embedding_dim=in_channels) + self.flatten1 = paddle.nn.Flatten(start_axis=0, stop_axis=2) + self.fc1 = paddle.nn.Linear( + in_features=in_channels, out_features=1, bias_attr=False) + + def forward(self, inputs, encoder_word_pos, gsrm_word_pos): + b, c, h, w = inputs.shape + conv_features = paddle.reshape(inputs, shape=[-1, c, h * w]) + conv_features = paddle.transpose(conv_features, perm=[0, 2, 1]) + # transformer encoder + b, t, c = conv_features.shape + + enc_inputs = [conv_features, encoder_word_pos, None] + word_features = self.wrap_encoder_for_feature(enc_inputs) + + # pvam + b, t, c = word_features.shape + word_features = self.fc0(word_features) + word_features_ = paddle.reshape(word_features, [-1, 1, t, c]) + word_features_ = paddle.tile(word_features_, [1, self.max_length, 1, 1]) + word_pos_feature = self.emb(gsrm_word_pos) + word_pos_feature_ = paddle.reshape(word_pos_feature, + [-1, self.max_length, 1, c]) + word_pos_feature_ = paddle.tile(word_pos_feature_, [1, 1, t, 1]) + y = word_pos_feature_ + word_features_ + y = F.tanh(y) + attention_weight = self.fc1(y) + attention_weight = paddle.reshape( + attention_weight, shape=[-1, self.max_length, t]) + attention_weight = F.softmax(attention_weight, axis=-1) + pvam_features = paddle.matmul(attention_weight, + word_features) #[b, max_length, c] + return pvam_features + + +class GSRM(nn.Layer): + def __init__(self, in_channels, char_num, max_text_length, num_heads, + num_encoder_tus, num_decoder_tus, hidden_dims): + super(GSRM, self).__init__() + self.char_num = char_num + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_tus + self.num_decoder_TUs = num_decoder_tus + self.hidden_dims = hidden_dims + + self.fc0 = paddle.nn.Linear( + in_features=in_channels, out_features=self.char_num) + self.wrap_encoder0 = WrapEncoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / 
self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.wrap_encoder1 = WrapEncoder( + src_vocab_size=self.char_num + 1, + max_length=self.max_length, + n_layer=self.num_decoder_TUs, + n_head=self.num_heads, + d_key=int(self.hidden_dims / self.num_heads), + d_value=int(self.hidden_dims / self.num_heads), + d_model=self.hidden_dims, + d_inner_hid=self.hidden_dims, + prepostprocess_dropout=0.1, + attention_dropout=0.1, + relu_dropout=0.1, + preprocess_cmd="n", + postprocess_cmd="da", + weight_sharing=True) + + self.mul = lambda x: paddle.matmul(x=x, + y=self.wrap_encoder0.prepare_decoder.emb0.weight, + transpose_y=True) + + def forward(self, inputs, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2): + # ===== GSRM Visual-to-semantic embedding block ===== + b, t, c = inputs.shape + pvam_features = paddle.reshape(inputs, [-1, c]) + word_out = self.fc0(pvam_features) + word_ids = paddle.argmax(F.softmax(word_out), axis=1) + word_ids = paddle.reshape(x=word_ids, shape=[-1, t, 1]) + + #===== GSRM Semantic reasoning block ===== + """ + This module is achieved through bi-transformers, + ngram_feature1 is the froward one, ngram_fetaure2 is the backward one + """ + pad_idx = self.char_num + + word1 = paddle.cast(word_ids, "float32") + word1 = F.pad(word1, [1, 0], value=1.0 * pad_idx, data_format="NLC") + word1 = paddle.cast(word1, "int64") + word1 = word1[:, :-1, :] + word2 = word_ids + + enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1] + enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2] + + gsrm_feature1 = self.wrap_encoder0(enc_inputs_1) + gsrm_feature2 = self.wrap_encoder1(enc_inputs_2) + + gsrm_feature2 = F.pad(gsrm_feature2, [0, 1], + value=0., + data_format="NLC") + gsrm_feature2 = gsrm_feature2[:, 1:, ] + gsrm_features = gsrm_feature1 + gsrm_feature2 + + gsrm_out = self.mul(gsrm_features) + + b, t, c = gsrm_out.shape + gsrm_out = paddle.reshape(gsrm_out, [-1, c]) + + return gsrm_features, word_out, gsrm_out + + +class VSFD(nn.Layer): + def __init__(self, in_channels=512, pvam_ch=512, char_num=38): + super(VSFD, self).__init__() + self.char_num = char_num + self.fc0 = paddle.nn.Linear( + in_features=in_channels * 2, out_features=pvam_ch) + self.fc1 = paddle.nn.Linear( + in_features=pvam_ch, out_features=self.char_num) + + def forward(self, pvam_feature, gsrm_feature): + b, t, c1 = pvam_feature.shape + b, t, c2 = gsrm_feature.shape + combine_feature_ = paddle.concat([pvam_feature, gsrm_feature], axis=2) + img_comb_feature_ = paddle.reshape( + combine_feature_, shape=[-1, c1 + c2]) + img_comb_feature_map = self.fc0(img_comb_feature_) + img_comb_feature_map = F.sigmoid(img_comb_feature_map) + img_comb_feature_map = paddle.reshape( + img_comb_feature_map, shape=[-1, t, c1]) + combine_feature = img_comb_feature_map * pvam_feature + ( + 1.0 - img_comb_feature_map) * gsrm_feature + img_comb_feature = paddle.reshape(combine_feature, shape=[-1, c1]) + + out = self.fc1(img_comb_feature) + return out + + +class SRNHead(nn.Layer): + def __init__(self, in_channels, out_channels, max_text_length, num_heads, + num_encoder_TUs, num_decoder_TUs, hidden_dims, **kwargs): + super(SRNHead, self).__init__() + self.char_num = out_channels + self.max_length = max_text_length + self.num_heads = num_heads + self.num_encoder_TUs = num_encoder_TUs + self.num_decoder_TUs = 
num_decoder_TUs + self.hidden_dims = hidden_dims + + self.pvam = PVAM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + hidden_dims=self.hidden_dims) + + self.gsrm = GSRM( + in_channels=in_channels, + char_num=self.char_num, + max_text_length=self.max_length, + num_heads=self.num_heads, + num_encoder_tus=self.num_encoder_TUs, + num_decoder_tus=self.num_decoder_TUs, + hidden_dims=self.hidden_dims) + self.vsfd = VSFD(in_channels=in_channels, char_num=self.char_num) + + self.gsrm.wrap_encoder1.prepare_decoder.emb0 = self.gsrm.wrap_encoder0.prepare_decoder.emb0 + + def forward(self, inputs, targets=None): + others = targets[-4:] + encoder_word_pos = others[0] + gsrm_word_pos = others[1] + gsrm_slf_attn_bias1 = others[2] + gsrm_slf_attn_bias2 = others[3] + + pvam_feature = self.pvam(inputs, encoder_word_pos, gsrm_word_pos) + + gsrm_feature, word_out, gsrm_out = self.gsrm( + pvam_feature, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + final_out = self.vsfd(pvam_feature, gsrm_feature) + if not self.training: + final_out = F.softmax(final_out, axis=1) + + _, decoded_out = paddle.topk(final_out, k=1) + + predicts = OrderedDict([ + ('predict', final_out), + ('pvam_feature', pvam_feature), + ('decoded_out', decoded_out), + ('word_out', word_out), + ('gsrm_out', gsrm_out), + ]) + + return predicts diff --git a/ppocr/modeling/heads/rec_visionlan_head.py b/ppocr/modeling/heads/rec_visionlan_head.py new file mode 100644 index 0000000000000000000000000000000000000000..86054d9bbb12613e3119b4c0d72f4670344d773a --- /dev/null +++ b/ppocr/modeling/heads/rec_visionlan_head.py @@ -0,0 +1,468 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
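+# Illustrative usage sketch for SRNHead (rec_srn_head.py above). A minimal,
+# commented example, not part of the original file; it assumes the 512-channel
+# 8x32 feature map produced by ResNetFPN (so H*W == 256) and uses all-zero
+# attention biases in place of the triangular masks built by the SRN data
+# pipeline.
+#
+#   import paddle
+#   head = SRNHead(in_channels=512, out_channels=38, max_text_length=25,
+#                  num_heads=8, num_encoder_TUs=2, num_decoder_TUs=4,
+#                  hidden_dims=512)
+#   head.eval()
+#   feat = paddle.rand([1, 512, 8, 32])
+#   encoder_word_pos = paddle.arange(256, dtype='int64').reshape([1, 256, 1])
+#   gsrm_word_pos = paddle.arange(25, dtype='int64').reshape([1, 25, 1])
+#   bias1 = paddle.zeros([1, 8, 25, 25])
+#   bias2 = paddle.zeros([1, 8, 25, 25])
+#   preds = head(feat, targets=[encoder_word_pos, gsrm_word_pos, bias1, bias2])
+#   # preds['predict']: (N * max_text_length, out_channels) softmax scores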
+""" +This code is refer from: +https://github.com/wangyuxin87/VisionLAN +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, XavierNormal +import numpy as np + + +class PositionalEncoding(nn.Layer): + def __init__(self, d_hid, n_position=200): + super(PositionalEncoding, self).__init__() + self.register_buffer( + 'pos_table', self._get_sinusoid_encoding_table(n_position, d_hid)) + + def _get_sinusoid_encoding_table(self, n_position, d_hid): + ''' Sinusoid position encoding table ''' + + def get_position_angle_vec(position): + return [ + position / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ] + + sinusoid_table = np.array( + [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + sinusoid_table = paddle.to_tensor(sinusoid_table, dtype='float32') + sinusoid_table = paddle.unsqueeze(sinusoid_table, axis=0) + return sinusoid_table + + def forward(self, x): + return x + self.pos_table[:, :x.shape[1]].clone().detach() + + +class ScaledDotProductAttention(nn.Layer): + "Scaled Dot-Product Attention" + + def __init__(self, temperature, attn_dropout=0.1): + super(ScaledDotProductAttention, self).__init__() + self.temperature = temperature + self.dropout = nn.Dropout(attn_dropout) + self.softmax = nn.Softmax(axis=2) + + def forward(self, q, k, v, mask=None): + k = paddle.transpose(k, perm=[0, 2, 1]) + attn = paddle.bmm(q, k) + attn = attn / self.temperature + if mask is not None: + attn = attn.masked_fill(mask, -1e9) + if mask.dim() == 3: + mask = paddle.unsqueeze(mask, axis=1) + elif mask.dim() == 2: + mask = paddle.unsqueeze(mask, axis=1) + mask = paddle.unsqueeze(mask, axis=1) + repeat_times = [ + attn.shape[1] // mask.shape[1], attn.shape[2] // mask.shape[2] + ] + mask = paddle.tile(mask, [1, repeat_times[0], repeat_times[1], 1]) + attn[mask == 0] = -1e9 + attn = self.softmax(attn) + attn = self.dropout(attn) + output = paddle.bmm(attn, v) + return output + + +class MultiHeadAttention(nn.Layer): + " Multi-Head Attention module" + + def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_k = d_k + self.d_v = d_v + self.w_qs = nn.Linear( + d_model, + n_head * d_k, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=np.sqrt(2.0 / (d_model + d_k))))) + self.w_ks = nn.Linear( + d_model, + n_head * d_k, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=np.sqrt(2.0 / (d_model + d_k))))) + self.w_vs = nn.Linear( + d_model, + n_head * d_v, + weight_attr=ParamAttr(initializer=Normal( + mean=0, std=np.sqrt(2.0 / (d_model + d_v))))) + + self.attention = ScaledDotProductAttention(temperature=np.power(d_k, + 0.5)) + self.layer_norm = nn.LayerNorm(d_model) + self.fc = nn.Linear( + n_head * d_v, + d_model, + weight_attr=ParamAttr(initializer=XavierNormal())) + self.dropout = nn.Dropout(dropout) + + def forward(self, q, k, v, mask=None): + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_q, _ = q.shape + sz_b, len_k, _ = k.shape + sz_b, len_v, _ = v.shape + residual = q + + q = self.w_qs(q) + q = paddle.reshape( + q, shape=[-1, len_q, n_head, d_k]) # 4*21*512 ---- 4*21*8*64 + k = self.w_ks(k) + k = paddle.reshape(k, 
shape=[-1, len_k, n_head, d_k]) + v = self.w_vs(v) + v = paddle.reshape(v, shape=[-1, len_v, n_head, d_v]) + + q = paddle.transpose(q, perm=[2, 0, 1, 3]) + q = paddle.reshape(q, shape=[-1, len_q, d_k]) # (n*b) x lq x dk + k = paddle.transpose(k, perm=[2, 0, 1, 3]) + k = paddle.reshape(k, shape=[-1, len_k, d_k]) # (n*b) x lk x dk + v = paddle.transpose(v, perm=[2, 0, 1, 3]) + v = paddle.reshape(v, shape=[-1, len_v, d_v]) # (n*b) x lv x dv + + mask = paddle.tile( + mask, + [n_head, 1, 1]) if mask is not None else None # (n*b) x .. x .. + output = self.attention(q, k, v, mask=mask) + output = paddle.reshape(output, shape=[n_head, -1, len_q, d_v]) + output = paddle.transpose(output, perm=[1, 2, 0, 3]) + output = paddle.reshape( + output, shape=[-1, len_q, n_head * d_v]) # b x lq x (n*dv) + output = self.dropout(self.fc(output)) + output = self.layer_norm(output + residual) + return output + + +class PositionwiseFeedForward(nn.Layer): + def __init__(self, d_in, d_hid, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Conv1D(d_in, d_hid, 1) # position-wise + self.w_2 = nn.Conv1D(d_hid, d_in, 1) # position-wise + self.layer_norm = nn.LayerNorm(d_in) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + residual = x + x = paddle.transpose(x, perm=[0, 2, 1]) + x = self.w_2(F.relu(self.w_1(x))) + x = paddle.transpose(x, perm=[0, 2, 1]) + x = self.dropout(x) + x = self.layer_norm(x + residual) + return x + + +class EncoderLayer(nn.Layer): + ''' Compose with two layers ''' + + def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): + super(EncoderLayer, self).__init__() + self.slf_attn = MultiHeadAttention( + n_head, d_model, d_k, d_v, dropout=dropout) + self.pos_ffn = PositionwiseFeedForward( + d_model, d_inner, dropout=dropout) + + def forward(self, enc_input, slf_attn_mask=None): + enc_output = self.slf_attn( + enc_input, enc_input, enc_input, mask=slf_attn_mask) + enc_output = self.pos_ffn(enc_output) + return enc_output + + +class Transformer_Encoder(nn.Layer): + def __init__(self, + n_layers=2, + n_head=8, + d_word_vec=512, + d_k=64, + d_v=64, + d_model=512, + d_inner=2048, + dropout=0.1, + n_position=256): + super(Transformer_Encoder, self).__init__() + self.position_enc = PositionalEncoding( + d_word_vec, n_position=n_position) + self.dropout = nn.Dropout(p=dropout) + self.layer_stack = nn.LayerList([ + EncoderLayer( + d_model, d_inner, n_head, d_k, d_v, dropout=dropout) + for _ in range(n_layers) + ]) + self.layer_norm = nn.LayerNorm(d_model, epsilon=1e-6) + + def forward(self, enc_output, src_mask, return_attns=False): + enc_output = self.dropout( + self.position_enc(enc_output)) # position embeding + for enc_layer in self.layer_stack: + enc_output = enc_layer(enc_output, slf_attn_mask=src_mask) + enc_output = self.layer_norm(enc_output) + return enc_output + + +class PP_layer(nn.Layer): + def __init__(self, n_dim=512, N_max_character=25, n_position=256): + + super(PP_layer, self).__init__() + self.character_len = N_max_character + self.f0_embedding = nn.Embedding(N_max_character, n_dim) + self.w0 = nn.Linear(N_max_character, n_position) + self.wv = nn.Linear(n_dim, n_dim) + self.we = nn.Linear(n_dim, N_max_character) + self.active = nn.Tanh() + self.softmax = nn.Softmax(axis=2) + + def forward(self, enc_output): + # enc_output: b,256,512 + reading_order = paddle.arange(self.character_len, dtype='int64') + reading_order = reading_order.unsqueeze(0).expand( + [enc_output.shape[0], self.character_len]) # (S,) -> (B, S) + reading_order = 
self.f0_embedding(reading_order) # b,25,512 + + # calculate attention + reading_order = paddle.transpose(reading_order, perm=[0, 2, 1]) + t = self.w0(reading_order) # b,512,256 + t = self.active( + paddle.transpose( + t, perm=[0, 2, 1]) + self.wv(enc_output)) # b,256,512 + t = self.we(t) # b,256,25 + t = self.softmax(paddle.transpose(t, perm=[0, 2, 1])) # b,25,256 + g_output = paddle.bmm(t, enc_output) # b,25,512 + return g_output + + +class Prediction(nn.Layer): + def __init__(self, + n_dim=512, + n_position=256, + N_max_character=25, + n_class=37): + super(Prediction, self).__init__() + self.pp = PP_layer( + n_dim=n_dim, N_max_character=N_max_character, n_position=n_position) + self.pp_share = PP_layer( + n_dim=n_dim, N_max_character=N_max_character, n_position=n_position) + self.w_vrm = nn.Linear(n_dim, n_class) # output layer + self.w_share = nn.Linear(n_dim, n_class) # output layer + self.nclass = n_class + + def forward(self, cnn_feature, f_res, f_sub, train_mode=False, + use_mlm=True): + if train_mode: + if not use_mlm: + g_output = self.pp(cnn_feature) # b,25,512 + g_output = self.w_vrm(g_output) + f_res = 0 + f_sub = 0 + return g_output, f_res, f_sub + g_output = self.pp(cnn_feature) # b,25,512 + f_res = self.pp_share(f_res) + f_sub = self.pp_share(f_sub) + g_output = self.w_vrm(g_output) + f_res = self.w_share(f_res) + f_sub = self.w_share(f_sub) + return g_output, f_res, f_sub + else: + g_output = self.pp(cnn_feature) # b,25,512 + g_output = self.w_vrm(g_output) + return g_output + + +class MLM(nn.Layer): + "Architecture of MLM" + + def __init__(self, n_dim=512, n_position=256, max_text_length=25): + super(MLM, self).__init__() + self.MLM_SequenceModeling_mask = Transformer_Encoder( + n_layers=2, n_position=n_position) + self.MLM_SequenceModeling_WCL = Transformer_Encoder( + n_layers=1, n_position=n_position) + self.pos_embedding = nn.Embedding(max_text_length, n_dim) + self.w0_linear = nn.Linear(1, n_position) + self.wv = nn.Linear(n_dim, n_dim) + self.active = nn.Tanh() + self.we = nn.Linear(n_dim, 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, x, label_pos): + # transformer unit for generating mask_c + feature_v_seq = self.MLM_SequenceModeling_mask(x, src_mask=None) + # position embedding layer + label_pos = paddle.to_tensor(label_pos, dtype='int64') + pos_emb = self.pos_embedding(label_pos) + pos_emb = self.w0_linear(paddle.unsqueeze(pos_emb, axis=2)) + pos_emb = paddle.transpose(pos_emb, perm=[0, 2, 1]) + # fusion position embedding with features V & generate mask_c + att_map_sub = self.active(pos_emb + self.wv(feature_v_seq)) + att_map_sub = self.we(att_map_sub) # b,256,1 + att_map_sub = paddle.transpose(att_map_sub, perm=[0, 2, 1]) + att_map_sub = self.sigmoid(att_map_sub) # b,1,256 + # WCL + ## generate inputs for WCL + att_map_sub = paddle.transpose(att_map_sub, perm=[0, 2, 1]) + f_res = x * (1 - att_map_sub) # second path with remaining string + f_sub = x * att_map_sub # first path with occluded character + ## transformer units in WCL + f_res = self.MLM_SequenceModeling_WCL(f_res, src_mask=None) + f_sub = self.MLM_SequenceModeling_WCL(f_sub, src_mask=None) + return f_res, f_sub, att_map_sub + + +def trans_1d_2d(x): + b, w_h, c = x.shape # b, 256, 512 + x = paddle.transpose(x, perm=[0, 2, 1]) + x = paddle.reshape(x, [-1, c, 32, 8]) + x = paddle.transpose(x, perm=[0, 1, 3, 2]) # [b, c, 8, 32] + return x + + +class MLM_VRM(nn.Layer): + """ + MLM+VRM, MLM is only used in training. + ratio controls the occluded number in a batch. 
+ The pipeline of VisionLAN in testing is very concise with only a backbone + sequence modeling(transformer unit) + prediction layer(pp layer). + x: input image + label_pos: character index + training_step: LF or LA process + output + text_pre: prediction of VRM + test_rem: prediction of remaining string in MLM + text_mas: prediction of occluded character in MLM + mask_c_show: visualization of Mask_c + """ + + def __init__(self, + n_layers=3, + n_position=256, + n_dim=512, + max_text_length=25, + nclass=37): + super(MLM_VRM, self).__init__() + self.MLM = MLM(n_dim=n_dim, + n_position=n_position, + max_text_length=max_text_length) + self.SequenceModeling = Transformer_Encoder( + n_layers=n_layers, n_position=n_position) + self.Prediction = Prediction( + n_dim=n_dim, + n_position=n_position, + N_max_character=max_text_length + + 1, # N_max_character = 1 eos + 25 characters + n_class=nclass) + self.nclass = nclass + self.max_text_length = max_text_length + + def forward(self, x, label_pos, training_step, train_mode=False): + b, c, h, w = x.shape + nT = self.max_text_length + x = paddle.transpose(x, perm=[0, 1, 3, 2]) + x = paddle.reshape(x, [-1, c, h * w]) + x = paddle.transpose(x, perm=[0, 2, 1]) + if train_mode: + if training_step == 'LF_1': + f_res = 0 + f_sub = 0 + x = self.SequenceModeling(x, src_mask=None) + text_pre, test_rem, text_mas = self.Prediction( + x, f_res, f_sub, train_mode=True, use_mlm=False) + return text_pre, text_pre, text_pre, text_pre + elif training_step == 'LF_2': + # MLM + f_res, f_sub, mask_c = self.MLM(x, label_pos) + x = self.SequenceModeling(x, src_mask=None) + text_pre, test_rem, text_mas = self.Prediction( + x, f_res, f_sub, train_mode=True) + mask_c_show = trans_1d_2d(mask_c) + return text_pre, test_rem, text_mas, mask_c_show + elif training_step == 'LA': + # MLM + f_res, f_sub, mask_c = self.MLM(x, label_pos) + ## use the mask_c (1 for occluded character and 0 for remaining characters) to occlude input + ## ratio controls the occluded number in a batch + character_mask = paddle.zeros_like(mask_c) + + ratio = b // 2 + if ratio >= 1: + with paddle.no_grad(): + character_mask[0:ratio, :, :] = mask_c[0:ratio, :, :] + else: + character_mask = mask_c + x = x * (1 - character_mask) + # VRM + ## transformer unit for VRM + x = self.SequenceModeling(x, src_mask=None) + ## prediction layer for MLM and VSR + text_pre, test_rem, text_mas = self.Prediction( + x, f_res, f_sub, train_mode=True) + mask_c_show = trans_1d_2d(mask_c) + return text_pre, test_rem, text_mas, mask_c_show + else: + raise NotImplementedError + else: # VRM is only used in the testing stage + f_res = 0 + f_sub = 0 + contextual_feature = self.SequenceModeling(x, src_mask=None) + text_pre = self.Prediction( + contextual_feature, + f_res, + f_sub, + train_mode=False, + use_mlm=False) + text_pre = paddle.transpose( + text_pre, perm=[1, 0, 2]) # (26, b, 37)) + return text_pre, x + + +class VLHead(nn.Layer): + """ + Architecture of VisionLAN + """ + + def __init__(self, + in_channels, + out_channels=36, + n_layers=3, + n_position=256, + n_dim=512, + max_text_length=25, + training_step='LA'): + super(VLHead, self).__init__() + self.MLM_VRM = MLM_VRM( + n_layers=n_layers, + n_position=n_position, + n_dim=n_dim, + max_text_length=max_text_length, + nclass=out_channels + 1) + self.training_step = training_step + + def forward(self, feat, targets=None): + + if self.training: + label_pos = targets[-2] + text_pre, test_rem, text_mas, mask_map = self.MLM_VRM( + feat, label_pos, self.training_step, train_mode=True) + 
return text_pre, test_rem, text_mas, mask_map + else: + text_pre, x = self.MLM_VRM( + feat, targets, self.training_step, train_mode=False) + return text_pre, x diff --git a/ppocr/modeling/heads/self_attention.py b/ppocr/modeling/heads/self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..6e4c65e3931ae74a0fde2a16694a69fdfa69b5ed --- /dev/null +++ b/ppocr/modeling/heads/self_attention.py @@ -0,0 +1,405 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +gradient_clip = 10 + + +class WrapEncoderForFeature(nn.Layer): + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoderForFeature, self).__init__() + + self.prepare_encoder = PrepareEncoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx, + word_emb_param_name="src_word_emb_table") + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + conv_features, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_encoder(conv_features, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class WrapEncoder(nn.Layer): + """ + embedder + encoder + """ + + def __init__(self, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=0): + super(WrapEncoder, self).__init__() + + self.prepare_decoder = PrepareDecoder( + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + bos_idx=bos_idx) + self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd) + + def forward(self, enc_inputs): + src_word, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self.prepare_decoder(src_word, src_pos) + enc_output = self.encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class Encoder(nn.Layer): + """ + encoder + """ + + def __init__(self, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(Encoder, self).__init__() + + self.encoder_layers = list() + for i in range(n_layer): + self.encoder_layers.append( + 
self.add_sublayer( + "layer_%d" % i, + EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid, + prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, + postprocess_cmd))) + self.processer = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + for encoder_layer in self.encoder_layers: + enc_output = encoder_layer(enc_input, attn_bias) + enc_input = enc_output + enc_output = self.processer(enc_output) + return enc_output + + +class EncoderLayer(nn.Layer): + """ + EncoderLayer + """ + + def __init__(self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__() + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, enc_input, attn_bias): + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class MultiHeadAttention(nn.Layer): + """ + Multi-Head Attention + """ + + def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): + super(MultiHeadAttention, self).__init__() + self.n_head = n_head + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.dropout_rate = dropout_rate + self.q_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias_attr=False) + self.k_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_key * n_head, bias_attr=False) + self.v_fc = paddle.nn.Linear( + in_features=d_model, out_features=d_value * n_head, bias_attr=False) + self.proj_fc = paddle.nn.Linear( + in_features=d_value * n_head, out_features=d_model, bias_attr=False) + + def _prepare_qkv(self, queries, keys, values, cache=None): + if keys is None: # self-attention + keys, values = queries, queries + static_kv = False + else: # cross-attention + static_kv = True + + q = self.q_fc(queries) + q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and static_kv and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k = cache["static_k"] + v = cache["static_v"] + else: + k = self.k_fc(keys) + v = self.v_fc(values) + k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = paddle.transpose(x=k, perm=[0, 2, 1, 3]) + v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = paddle.transpose(x=v, perm=[0, 2, 1, 3]) + + if cache is not None: + if static_kv and not "static_k" in cache: + # for encoder-decoder attention in inference and has not cached + cache["static_k"], cache["static_v"] = k, v + elif not static_kv: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = paddle.concat([cache_k, k], axis=2) + v = paddle.concat([cache_v, 
v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def forward(self, queries, keys, values, attn_bias, cache=None): + # compute q ,k ,v + keys = queries if keys is None else keys + values = keys if values is None else values + q, k, v = self._prepare_qkv(queries, keys, values, cache) + + # scale dot product attention + product = paddle.matmul(x=q, y=k, transpose_y=True) + product = product * self.d_model**-0.5 + if attn_bias is not None: + product += attn_bias + weights = F.softmax(product) + if self.dropout_rate: + weights = F.dropout( + weights, p=self.dropout_rate, mode="downscale_in_infer") + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.proj_fc(out) + + return out + + +class PrePostProcessLayer(nn.Layer): + """ + PrePostProcessLayer + """ + + def __init__(self, process_cmd, d_model, dropout_rate): + super(PrePostProcessLayer, self).__init__() + self.process_cmd = process_cmd + self.functors = [] + for cmd in self.process_cmd: + if cmd == "a": # add residual connection + self.functors.append(lambda x, y: x + y if y is not None else x) + elif cmd == "n": # add layer normalization + self.functors.append( + self.add_sublayer( + "layer_norm_%d" % len(self.sublayers()), + paddle.nn.LayerNorm( + normalized_shape=d_model, + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(1.)), + bias_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(0.))))) + elif cmd == "d": # add dropout + self.functors.append(lambda x: F.dropout( + x, p=dropout_rate, mode="downscale_in_infer") + if dropout_rate else x) + + def forward(self, x, residual=None): + for i, cmd in enumerate(self.process_cmd): + if cmd == "a": + x = self.functors[i](x, residual) + else: + x = self.functors[i](x) + return x + + +class PrepareEncoder(nn.Layer): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareEncoder, self).__init__() + self.src_emb_dim = src_emb_dim + self.src_max_len = src_max_len + self.emb = paddle.nn.Embedding( + num_embeddings=self.src_max_len, embedding_dim=self.src_emb_dim) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word_emb = src_word + src_word_emb = paddle.cast(src_word_emb, 'float32') + src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) + src_pos = paddle.squeeze(src_pos, axis=-1) + src_pos_enc = self.emb(src_pos) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if self.dropout_rate: + out = F.dropout( + x=enc_input, p=self.dropout_rate, mode="downscale_in_infer") + else: + out = enc_input + return out + + +class PrepareDecoder(nn.Layer): + def __init__(self, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate=0, + bos_idx=0, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareDecoder, self).__init__() + self.src_emb_dim = src_emb_dim + """ + self.emb0 = Embedding(num_embeddings=src_vocab_size, + embedding_dim=src_emb_dim) + """ + self.emb0 = paddle.nn.Embedding( + num_embeddings=src_vocab_size, + embedding_dim=self.src_emb_dim, + padding_idx=bos_idx, + weight_attr=paddle.ParamAttr( + name=word_emb_param_name, + initializer=nn.initializer.Normal(0., src_emb_dim**-0.5))) + self.emb1 = paddle.nn.Embedding( + num_embeddings=src_max_len, + 
embedding_dim=self.src_emb_dim, + weight_attr=paddle.ParamAttr(name=pos_enc_param_name)) + self.dropout_rate = dropout_rate + + def forward(self, src_word, src_pos): + src_word = paddle.cast(src_word, 'int64') + src_word = paddle.squeeze(src_word, axis=-1) + src_word_emb = self.emb0(src_word) + src_word_emb = paddle.scale(x=src_word_emb, scale=self.src_emb_dim**0.5) + src_pos = paddle.squeeze(src_pos, axis=-1) + src_pos_enc = self.emb1(src_pos) + src_pos_enc.stop_gradient = True + enc_input = src_word_emb + src_pos_enc + if self.dropout_rate: + out = F.dropout( + x=enc_input, p=self.dropout_rate, mode="downscale_in_infer") + else: + out = enc_input + return out + + +class FFN(nn.Layer): + """ + Feed-Forward Network + """ + + def __init__(self, d_inner_hid, d_model, dropout_rate): + super(FFN, self).__init__() + self.dropout_rate = dropout_rate + self.fc1 = paddle.nn.Linear( + in_features=d_model, out_features=d_inner_hid) + self.fc2 = paddle.nn.Linear( + in_features=d_inner_hid, out_features=d_model) + + def forward(self, x): + hidden = self.fc1(x) + hidden = F.relu(hidden) + if self.dropout_rate: + hidden = F.dropout( + hidden, p=self.dropout_rate, mode="downscale_in_infer") + out = self.fc2(hidden) + return out diff --git a/ppocr/modeling/heads/sr_rensnet_transformer.py b/ppocr/modeling/heads/sr_rensnet_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..654f3fca5486229c176246237708c4cf6a8da9ec --- /dev/null +++ b/ppocr/modeling/heads/sr_rensnet_transformer.py @@ -0,0 +1,427 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/loss/transformer_english_decomposition.py +""" +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def subsequent_mask(size): + """Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). 
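+    Example (illustrative): for size=3 the returned value is a [1, 3, 3]
+    boolean mask
+        [[[True, False, False],
+          [True, True,  False],
+          [True, True,  True ]]]
+    i.e. True marks the positions a query may attend to, so position i can
+    only look at positions j <= i.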
+ """ + mask = paddle.ones([1, size, size], dtype='float32') + mask_inf = paddle.triu( + paddle.full( + shape=[1, size, size], dtype='float32', fill_value='-inf'), + diagonal=1) + mask = mask + mask_inf + padding_mask = paddle.equal(mask, paddle.to_tensor(1, dtype=mask.dtype)) + return padding_mask + + +def clones(module, N): + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def attention(query, key, value, mask=None, dropout=None, attention_map=None): + d_k = query.shape[-1] + scores = paddle.matmul(query, + paddle.transpose(key, [0, 1, 3, 2])) / math.sqrt(d_k) + + if mask is not None: + scores = masked_fill(scores, mask == 0, float('-inf')) + else: + pass + + p_attn = F.softmax(scores, axis=-1) + + if dropout is not None: + p_attn = dropout(p_attn) + return paddle.matmul(p_attn, value), p_attn + + +class MultiHeadedAttention(nn.Layer): + def __init__(self, h, d_model, dropout=0.1, compress_attention=False): + super(MultiHeadedAttention, self).__init__() + assert d_model % h == 0 + self.d_k = d_model // h + self.h = h + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(p=dropout, mode="downscale_in_infer") + self.compress_attention = compress_attention + self.compress_attention_linear = nn.Linear(h, 1) + + def forward(self, query, key, value, mask=None, attention_map=None): + if mask is not None: + mask = mask.unsqueeze(1) + nbatches = query.shape[0] + + query, key, value = \ + [paddle.transpose(l(x).reshape([nbatches, -1, self.h, self.d_k]), [0,2,1,3]) + for l, x in zip(self.linears, (query, key, value))] + + x, attention_map = attention( + query, + key, + value, + mask=mask, + dropout=self.dropout, + attention_map=attention_map) + + x = paddle.reshape( + paddle.transpose(x, [0, 2, 1, 3]), + [nbatches, -1, self.h * self.d_k]) + + return self.linears[-1](x), attention_map + + +class ResNet(nn.Layer): + def __init__(self, num_in, block, layers): + super(ResNet, self).__init__() + + self.conv1 = nn.Conv2D(num_in, 64, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(64, use_global_stats=True) + self.relu1 = nn.ReLU() + self.pool = nn.MaxPool2D((2, 2), (2, 2)) + + self.conv2 = nn.Conv2D(64, 128, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(128, use_global_stats=True) + self.relu2 = nn.ReLU() + + self.layer1_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer1 = self._make_layer(block, 128, 256, layers[0]) + self.layer1_conv = nn.Conv2D(256, 256, 3, 1, 1) + self.layer1_bn = nn.BatchNorm2D(256, use_global_stats=True) + self.layer1_relu = nn.ReLU() + + self.layer2_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer2 = self._make_layer(block, 256, 256, layers[1]) + self.layer2_conv = nn.Conv2D(256, 256, 3, 1, 1) + self.layer2_bn = nn.BatchNorm2D(256, use_global_stats=True) + self.layer2_relu = nn.ReLU() + + self.layer3_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer3 = self._make_layer(block, 256, 512, layers[2]) + self.layer3_conv = nn.Conv2D(512, 512, 3, 1, 1) + self.layer3_bn = nn.BatchNorm2D(512, use_global_stats=True) + self.layer3_relu = nn.ReLU() + + self.layer4_pool = nn.MaxPool2D((2, 2), (2, 2)) + self.layer4 = self._make_layer(block, 512, 512, layers[3]) + self.layer4_conv2 = nn.Conv2D(512, 1024, 3, 1, 1) + self.layer4_conv2_bn = nn.BatchNorm2D(1024, use_global_stats=True) + self.layer4_conv2_relu = nn.ReLU() + + def _make_layer(self, block, inplanes, planes, blocks): + + if 
inplanes != planes: + downsample = nn.Sequential( + nn.Conv2D(inplanes, planes, 3, 1, 1), + nn.BatchNorm2D( + planes, use_global_stats=True), ) + else: + downsample = None + layers = [] + layers.append(block(inplanes, planes, downsample)) + for i in range(1, blocks): + layers.append(block(planes, planes, downsample=None)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu1(x) + x = self.pool(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.relu2(x) + + x = self.layer1_pool(x) + x = self.layer1(x) + x = self.layer1_conv(x) + x = self.layer1_bn(x) + x = self.layer1_relu(x) + + x = self.layer2(x) + x = self.layer2_conv(x) + x = self.layer2_bn(x) + x = self.layer2_relu(x) + + x = self.layer3(x) + x = self.layer3_conv(x) + x = self.layer3_bn(x) + x = self.layer3_relu(x) + + x = self.layer4(x) + x = self.layer4_conv2(x) + x = self.layer4_conv2_bn(x) + x = self.layer4_conv2_relu(x) + + return x + + +class Bottleneck(nn.Layer): + def __init__(self, input_dim): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2D(input_dim, input_dim, 1) + self.bn1 = nn.BatchNorm2D(input_dim, use_global_stats=True) + self.relu = nn.ReLU() + + self.conv2 = nn.Conv2D(input_dim, input_dim, 3, 1, 1) + self.bn2 = nn.BatchNorm2D(input_dim, use_global_stats=True) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class PositionalEncoding(nn.Layer): + "Implement the PE function." + + def __init__(self, dropout, dim, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout, mode="downscale_in_infer") + + pe = paddle.zeros([max_len, dim]) + position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1) + div_term = paddle.exp( + paddle.arange(0, dim, 2).astype('float32') * + (-math.log(10000.0) / dim)) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = paddle.unsqueeze(pe, 0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :paddle.shape(x)[1]] + return self.dropout(x) + + +class PositionwiseFeedForward(nn.Layer): + "Implements FFN equation." + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout, mode="downscale_in_infer") + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class Generator(nn.Layer): + "Define standard linear + softmax generation step." + + def __init__(self, d_model, vocab): + super(Generator, self).__init__() + self.proj = nn.Linear(d_model, vocab) + self.relu = nn.ReLU() + + def forward(self, x): + out = self.proj(x) + return out + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab): + super(Embeddings, self).__init__() + self.lut = nn.Embedding(vocab, d_model) + self.d_model = d_model + + def forward(self, x): + embed = self.lut(x) * math.sqrt(self.d_model) + return embed + + +class LayerNorm(nn.Layer): + "Construct a layernorm module (See citation for details)." 
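+    # Descriptive note: normalizes over the last feature dimension,
+    #     y = a_2 * (x - mean) / (std + eps) + b_2
+    # where a_2 and b_2 are the learnable scale and bias created below.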
+ + def __init__(self, features, eps=1e-6): + super(LayerNorm, self).__init__() + self.a_2 = self.create_parameter( + shape=[features], + default_initializer=paddle.nn.initializer.Constant(1.0)) + self.b_2 = self.create_parameter( + shape=[features], + default_initializer=paddle.nn.initializer.Constant(0.0)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + + +class Decoder(nn.Layer): + def __init__(self): + super(Decoder, self).__init__() + + self.mask_multihead = MultiHeadedAttention( + h=16, d_model=1024, dropout=0.1) + self.mul_layernorm1 = LayerNorm(1024) + + self.multihead = MultiHeadedAttention(h=16, d_model=1024, dropout=0.1) + self.mul_layernorm2 = LayerNorm(1024) + + self.pff = PositionwiseFeedForward(1024, 2048) + self.mul_layernorm3 = LayerNorm(1024) + + def forward(self, text, conv_feature, attention_map=None): + text_max_length = text.shape[1] + mask = subsequent_mask(text_max_length) + result = text + result = self.mul_layernorm1(result + self.mask_multihead( + text, text, text, mask=mask)[0]) + b, c, h, w = conv_feature.shape + conv_feature = paddle.transpose( + conv_feature.reshape([b, c, h * w]), [0, 2, 1]) + word_image_align, attention_map = self.multihead( + result, + conv_feature, + conv_feature, + mask=None, + attention_map=attention_map) + result = self.mul_layernorm2(result + word_image_align) + result = self.mul_layernorm3(result + self.pff(result)) + + return result, attention_map + + +class BasicBlock(nn.Layer): + def __init__(self, inplanes, planes, downsample): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, planes, kernel_size=3, stride=1, padding=1) + self.bn1 = nn.BatchNorm2D(planes, use_global_stats=True) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, planes, kernel_size=3, stride=1, padding=1) + self.bn2 = nn.BatchNorm2D(planes, use_global_stats=True) + self.downsample = downsample + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample != None: + residual = self.downsample(residual) + + out += residual + out = self.relu(out) + + return out + + +class Encoder(nn.Layer): + def __init__(self): + super(Encoder, self).__init__() + self.cnn = ResNet(num_in=1, block=BasicBlock, layers=[1, 2, 5, 3]) + + def forward(self, input): + conv_result = self.cnn(input) + return conv_result + + +class Transformer(nn.Layer): + def __init__(self, in_channels=1, alphabet='0123456789'): + super(Transformer, self).__init__() + self.alphabet = alphabet + word_n_class = self.get_alphabet_len() + self.embedding_word_with_upperword = Embeddings(512, word_n_class) + self.pe = PositionalEncoding(dim=512, dropout=0.1, max_len=5000) + + self.encoder = Encoder() + self.decoder = Decoder() + self.generator_word_with_upperword = Generator(1024, word_n_class) + + for p in self.parameters(): + if p.dim() > 1: + nn.initializer.XavierNormal(p) + + def get_alphabet_len(self): + return len(self.alphabet) + + def forward(self, image, text_length, text_input, attention_map=None): + if image.shape[1] == 3: + R = image[:, 0:1, :, :] + G = image[:, 1:2, :, :] + B = image[:, 2:3, :, :] + image = 0.299 * R + 0.587 * G + 0.114 * B + + conv_feature = self.encoder(image) # batch, 1024, 8, 32 + max_length = max(text_length) + text_input = text_input[:, :max_length] + + text_embedding = self.embedding_word_with_upperword( + 
text_input) # batch, text_max_length, 512 + postion_embedding = self.pe( + paddle.zeros(text_embedding.shape)) # batch, text_max_length, 512 + text_input_with_pe = paddle.concat([text_embedding, postion_embedding], + 2) # batch, text_max_length, 1024 + batch, seq_len, _ = text_input_with_pe.shape + + text_input_with_pe, word_attention_map = self.decoder( + text_input_with_pe, conv_feature) + + word_decoder_result = self.generator_word_with_upperword( + text_input_with_pe) + + if self.training: + total_length = paddle.sum(text_length) + probs_res = paddle.zeros([total_length, self.get_alphabet_len()]) + start = 0 + + for index, length in enumerate(text_length): + length = int(length.numpy()) + probs_res[start:start + length, :] = word_decoder_result[ + index, 0:0 + length, :] + + start = start + length + + return probs_res, word_attention_map, None + else: + return word_decoder_result diff --git a/ppocr/modeling/heads/table_att_head.py b/ppocr/modeling/heads/table_att_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e3fc8436e78bf3959eec8cb89efc66500fa56bdc --- /dev/null +++ b/ppocr/modeling/heads/table_att_head.py @@ -0,0 +1,248 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
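+# Descriptive note: this file defines the table-structure recognition heads
+# (TableAttentionHead and SLAHead); at each decode step they predict a table
+# structure token and regress loc_reg_num coordinates for the cell box.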
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +from paddle import ParamAttr +import paddle.nn.functional as F +import numpy as np + +from .rec_att_head import AttentionGRUCell + + +def get_para_bias_attr(l2_decay, k): + if l2_decay > 0: + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + else: + regularizer = None + initializer = None + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +class TableAttentionHead(nn.Layer): + def __init__(self, + in_channels, + hidden_size, + in_max_len=488, + max_text_length=800, + out_channels=30, + loc_reg_num=4, + **kwargs): + super(TableAttentionHead, self).__init__() + self.input_size = in_channels[-1] + self.hidden_size = hidden_size + self.out_channels = out_channels + self.max_text_length = max_text_length + + self.structure_attention_cell = AttentionGRUCell( + self.input_size, hidden_size, self.out_channels, use_gru=False) + self.structure_generator = nn.Linear(hidden_size, self.out_channels) + self.in_max_len = in_max_len + + if self.in_max_len == 640: + self.loc_fea_trans = nn.Linear(400, self.max_text_length + 1) + elif self.in_max_len == 800: + self.loc_fea_trans = nn.Linear(625, self.max_text_length + 1) + else: + self.loc_fea_trans = nn.Linear(256, self.max_text_length + 1) + self.loc_generator = nn.Linear(self.input_size + hidden_size, + loc_reg_num) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None): + # if and else branch are both needed when you want to assign a variable + # if you modify the var in just one branch, then the modification will not work. 
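+        # Shape walk-through (illustrative): inputs[-1] is the last backbone
+        # feature map of shape (N, C, H, W); it is flattened and transposed to
+        # (N, H*W, C) so every spatial position becomes one time step for the
+        # attention GRU cell.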
+ fea = inputs[-1] + last_shape = int(np.prod(fea.shape[2:])) # gry added + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + batch_size = fea.shape[0] + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size)) + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_text_length + 1): + elem_onehots = self._char_to_onehot( + structure[:, i], onehot_dim=self.out_channels) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots) + output_hiddens[:, i, :] = outputs + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + else: + temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") + structure_probs = None + loc_preds = None + elem_onehots = None + outputs = None + alpha = None + max_text_length = paddle.to_tensor(self.max_text_length) + for i in range(max_text_length + 1): + elem_onehots = self._char_to_onehot( + temp_elem, onehot_dim=self.out_channels) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots) + output_hiddens[:, i, :] = outputs + structure_probs_step = self.structure_generator(outputs) + temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") + + structure_probs = self.structure_generator(output_hiddens) + structure_probs = F.softmax(structure_probs) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + return {'structure_probs': structure_probs, 'loc_preds': loc_preds} + + +class SLAHead(nn.Layer): + def __init__(self, + in_channels, + hidden_size, + out_channels=30, + max_text_length=500, + loc_reg_num=4, + fc_decay=0.0, + **kwargs): + """ + @param in_channels: input shape + @param hidden_size: hidden_size for RNN and Embedding + @param out_channels: num_classes to rec + @param max_text_length: max text pred + """ + super().__init__() + in_channels = in_channels[-1] + self.hidden_size = hidden_size + self.max_text_length = max_text_length + self.emb = self._char_to_onehot + self.num_embeddings = out_channels + self.loc_reg_num = loc_reg_num + + # structure + self.structure_attention_cell = AttentionGRUCell( + in_channels, hidden_size, self.num_embeddings) + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + weight_attr1_1, bias_attr1_1 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + weight_attr1_2, bias_attr1_2 = get_para_bias_attr( + l2_decay=fc_decay, k=hidden_size) + self.structure_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + self.hidden_size, + weight_attr=weight_attr1_2, + bias_attr=bias_attr1_2), + nn.Linear( + hidden_size, + out_channels, + weight_attr=weight_attr, + bias_attr=bias_attr)) + # loc + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size) + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=self.hidden_size) + self.loc_generator = nn.Sequential( + nn.Linear( + self.hidden_size, + 
self.hidden_size, + weight_attr=weight_attr1, + bias_attr=bias_attr1), + nn.Linear( + self.hidden_size, + loc_reg_num, + weight_attr=weight_attr2, + bias_attr=bias_attr2), + nn.Sigmoid()) + + def forward(self, inputs, targets=None): + fea = inputs[-1] + batch_size = fea.shape[0] + # reshape + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], -1]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + + hidden = paddle.zeros((batch_size, self.hidden_size)) + structure_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.num_embeddings)) + loc_preds = paddle.zeros( + (batch_size, self.max_text_length + 1, self.loc_reg_num)) + structure_preds.stop_gradient = True + loc_preds.stop_gradient = True + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_text_length + 1): + hidden, structure_step, loc_step = self._decode(structure[:, i], + fea, hidden) + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + else: + pre_chars = paddle.zeros(shape=[batch_size], dtype="int32") + max_text_length = paddle.to_tensor(self.max_text_length) + # for export + loc_step, structure_step = None, None + for i in range(max_text_length + 1): + hidden, structure_step, loc_step = self._decode(pre_chars, fea, + hidden) + pre_chars = structure_step.argmax(axis=1, dtype="int32") + structure_preds[:, i, :] = structure_step + loc_preds[:, i, :] = loc_step + if not self.training: + structure_preds = F.softmax(structure_preds) + return {'structure_probs': structure_preds, 'loc_preds': loc_preds} + + def _decode(self, pre_chars, features, hidden): + """ + Predict table label and coordinates for each step + @param pre_chars: Table label in previous step + @param features: + @param hidden: hidden status in previous step + @return: + """ + emb_feature = self.emb(pre_chars) + # output shape is b * self.hidden_size + (output, hidden), alpha = self.structure_attention_cell( + hidden, features, emb_feature) + + # structure + structure_step = self.structure_generator(output) + # loc + loc_step = self.loc_generator(output) + return hidden, structure_step, loc_step + + def _char_to_onehot(self, input_char): + input_ont_hot = F.one_hot(input_char, self.num_embeddings) + return input_ont_hot diff --git a/ppocr/modeling/heads/table_master_head.py b/ppocr/modeling/heads/table_master_head.py new file mode 100644 index 0000000000000000000000000000000000000000..486f9cbea13c15b0f3a6d608789163f18f678914 --- /dev/null +++ b/ppocr/modeling/heads/table_master_head.py @@ -0,0 +1,281 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/mmocr/models/textrecog/decoders/master_decoder.py +""" + +import copy +import math +import paddle +from paddle import nn +from paddle.nn import functional as F + + +class TableMasterHead(nn.Layer): + """ + Split to two transformer header at the last layer. 
+ Cls_layer is used to structure token classification. + Bbox_layer is used to regress bbox coord. + """ + + def __init__(self, + in_channels, + out_channels=30, + headers=8, + d_ff=2048, + dropout=0, + max_text_length=500, + loc_reg_num=4, + **kwargs): + super(TableMasterHead, self).__init__() + hidden_size = in_channels[-1] + self.layers = clones( + DecoderLayer(headers, hidden_size, dropout, d_ff), 2) + self.cls_layer = clones( + DecoderLayer(headers, hidden_size, dropout, d_ff), 1) + self.bbox_layer = clones( + DecoderLayer(headers, hidden_size, dropout, d_ff), 1) + self.cls_fc = nn.Linear(hidden_size, out_channels) + self.bbox_fc = nn.Sequential( + # nn.Linear(hidden_size, hidden_size), + nn.Linear(hidden_size, loc_reg_num), + nn.Sigmoid()) + self.norm = nn.LayerNorm(hidden_size) + self.embedding = Embeddings(d_model=hidden_size, vocab=out_channels) + self.positional_encoding = PositionalEncoding(d_model=hidden_size) + + self.SOS = out_channels - 3 + self.PAD = out_channels - 1 + self.out_channels = out_channels + self.loc_reg_num = loc_reg_num + self.max_text_length = max_text_length + + def make_mask(self, tgt): + """ + Make mask for self attention. + :param src: [b, c, h, l_src] + :param tgt: [b, l_tgt] + :return: + """ + trg_pad_mask = (tgt != self.PAD).unsqueeze(1).unsqueeze(3) + + tgt_len = paddle.shape(tgt)[1] + trg_sub_mask = paddle.tril( + paddle.ones( + ([tgt_len, tgt_len]), dtype=paddle.float32)) + + tgt_mask = paddle.logical_and( + trg_pad_mask.astype(paddle.float32), trg_sub_mask) + return tgt_mask.astype(paddle.float32) + + def decode(self, input, feature, src_mask, tgt_mask): + # main process of transformer decoder. + x = self.embedding(input) # x: 1*x*512, feature: 1*3600,512 + x = self.positional_encoding(x) + + # origin transformer layers + for i, layer in enumerate(self.layers): + x = layer(x, feature, src_mask, tgt_mask) + + # cls head + for layer in self.cls_layer: + cls_x = layer(x, feature, src_mask, tgt_mask) + cls_x = self.norm(cls_x) + + # bbox head + for layer in self.bbox_layer: + bbox_x = layer(x, feature, src_mask, tgt_mask) + bbox_x = self.norm(bbox_x) + return self.cls_fc(cls_x), self.bbox_fc(bbox_x) + + def greedy_forward(self, SOS, feature): + input = SOS + output = paddle.zeros( + [input.shape[0], self.max_text_length + 1, self.out_channels]) + bbox_output = paddle.zeros( + [input.shape[0], self.max_text_length + 1, self.loc_reg_num]) + max_text_length = paddle.to_tensor(self.max_text_length) + for i in range(max_text_length + 1): + target_mask = self.make_mask(input) + out_step, bbox_output_step = self.decode(input, feature, None, + target_mask) + prob = F.softmax(out_step, axis=-1) + next_word = prob.argmax(axis=2, dtype="int64") + input = paddle.concat( + [input, next_word[:, -1].unsqueeze(-1)], axis=1) + if i == self.max_text_length: + output = out_step + bbox_output = bbox_output_step + return output, bbox_output + + def forward_train(self, out_enc, targets): + # x is token of label + # feat is feature after backbone before pe. + # out_enc is feature after pe. 
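+        # Teacher forcing (descriptive note): the decoder input is the label
+        # sequence without its last token, padded_targets[:, :-1], so step i
+        # predicts token i + 1 and its box; the loss defined elsewhere is
+        # expected to compare against the correspondingly shifted labels.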
+ padded_targets = targets[0] + src_mask = None + tgt_mask = self.make_mask(padded_targets[:, :-1]) + output, bbox_output = self.decode(padded_targets[:, :-1], out_enc, + src_mask, tgt_mask) + return {'structure_probs': output, 'loc_preds': bbox_output} + + def forward_test(self, out_enc): + batch_size = out_enc.shape[0] + SOS = paddle.zeros([batch_size, 1], dtype='int64') + self.SOS + output, bbox_output = self.greedy_forward(SOS, out_enc) + output = F.softmax(output) + return {'structure_probs': output, 'loc_preds': bbox_output} + + def forward(self, feat, targets=None): + feat = feat[-1] + b, c, h, w = feat.shape + feat = feat.reshape([b, c, h * w]) # flatten 2D feature map + feat = feat.transpose((0, 2, 1)) + out_enc = self.positional_encoding(feat) + if self.training: + return self.forward_train(out_enc, targets) + + return self.forward_test(out_enc) + + +class DecoderLayer(nn.Layer): + """ + Decoder is made of self attention, srouce attention and feed forward. + """ + + def __init__(self, headers, d_model, dropout, d_ff): + super(DecoderLayer, self).__init__() + self.self_attn = MultiHeadAttention(headers, d_model, dropout) + self.src_attn = MultiHeadAttention(headers, d_model, dropout) + self.feed_forward = FeedForward(d_model, d_ff, dropout) + self.sublayer = clones(SubLayerConnection(d_model, dropout), 3) + + def forward(self, x, feature, src_mask, tgt_mask): + x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask)) + x = self.sublayer[1]( + x, lambda x: self.src_attn(x, feature, feature, src_mask)) + return self.sublayer[2](x, self.feed_forward) + + +class MultiHeadAttention(nn.Layer): + def __init__(self, headers, d_model, dropout): + super(MultiHeadAttention, self).__init__() + + assert d_model % headers == 0 + self.d_k = int(d_model / headers) + self.headers = headers + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(dropout) + + def forward(self, query, key, value, mask=None): + B = query.shape[0] + + # 1) Do all the linear projections in batch from d_model => h x d_k + query, key, value = \ + [l(x).reshape([B, 0, self.headers, self.d_k]).transpose([0, 2, 1, 3]) + for l, x in zip(self.linears, (query, key, value))] + # 2) Apply attention on all the projected vectors in batch + x, self.attn = self_attention( + query, key, value, mask=mask, dropout=self.dropout) + x = x.transpose([0, 2, 1, 3]).reshape([B, 0, self.headers * self.d_k]) + return self.linears[-1](x) + + +class FeedForward(nn.Layer): + def __init__(self, d_model, d_ff, dropout): + super(FeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class SubLayerConnection(nn.Layer): + """ + A residual connection followed by a layer norm. + Note for code simplicity the norm is first as opposed to last. 
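+    In other words (illustrative): out = x + dropout(sublayer(LayerNorm(x))).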
+ """ + + def __init__(self, size, dropout): + super(SubLayerConnection, self).__init__() + self.norm = nn.LayerNorm(size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, sublayer): + return x + self.dropout(sublayer(self.norm(x))) + + +def masked_fill(x, mask, value): + mask = mask.astype(x.dtype) + return x * paddle.logical_not(mask).astype(x.dtype) + mask * value + + +def self_attention(query, key, value, mask=None, dropout=None): + """ + Compute 'Scale Dot Product Attention' + """ + d_k = value.shape[-1] + + score = paddle.matmul(query, key.transpose([0, 1, 3, 2]) / math.sqrt(d_k)) + if mask is not None: + # score = score.masked_fill(mask == 0, -1e9) # b, h, L, L + score = masked_fill(score, mask == 0, -6.55e4) # for fp16 + + p_attn = F.softmax(score, axis=-1) + + if dropout is not None: + p_attn = dropout(p_attn) + return paddle.matmul(p_attn, value), p_attn + + +def clones(module, N): + """ Produce N identical layers """ + return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) + + +class Embeddings(nn.Layer): + def __init__(self, d_model, vocab): + super(Embeddings, self).__init__() + self.lut = nn.Embedding(vocab, d_model) + self.d_model = d_model + + def forward(self, *input): + x = input[0] + return self.lut(x) * math.sqrt(self.d_model) + + +class PositionalEncoding(nn.Layer): + """ Implement the PE function. """ + + def __init__(self, d_model, dropout=0., max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + # Compute the positional encodings once in log space. + pe = paddle.zeros([max_len, d_model]) + position = paddle.arange(0, max_len).unsqueeze(1).astype('float32') + div_term = paddle.exp( + paddle.arange(0, d_model, 2) * -math.log(10000.0) / d_model) + pe[:, 0::2] = paddle.sin(position * div_term) + pe[:, 1::2] = paddle.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, feat, **kwargs): + feat = feat + self.pe[:, :paddle.shape(feat)[1]] # pe 1*5000*512 + return self.dropout(feat) diff --git a/ppocr/modeling/necks/__init__.py b/ppocr/modeling/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5e89a5b80f665d77833ffedaa2c141a3022f25d --- /dev/null +++ b/ppocr/modeling/necks/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = ['build_neck'] + + +def build_neck(config): + from .db_fpn import DBFPN, RSEFPN, LKPAN + from .east_fpn import EASTFPN + from .sast_fpn import SASTFPN + from .rnn import SequenceEncoder + from .pg_fpn import PGFPN + from .table_fpn import TableFPN + from .fpn import FPN + from .fce_fpn import FCEFPN + from .pren_fpn import PRENFPN + from .csp_pan import CSPPAN + from .ct_fpn import CTFPN + from .fpn_unet import FPN_UNet + from .rf_adaptor import RFAdaptor + support_dict = [ + 'FPN', 'FCEFPN', 'LKPAN', 'DBFPN', 'RSEFPN', 'EASTFPN', 'SASTFPN', + 'SequenceEncoder', 'PGFPN', 'TableFPN', 'PRENFPN', 'CSPPAN', 'CTFPN', + 'RFAdaptor', 'FPN_UNet' + ] + + module_name = config.pop('name') + assert module_name in support_dict, Exception('neck only support {}'.format( + support_dict)) + + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/modeling/necks/csp_pan.py b/ppocr/modeling/necks/csp_pan.py new file mode 100644 index 0000000000000000000000000000000000000000..f4f8547f7d80d25edfe66824aa4f104341ae29ef --- /dev/null +++ b/ppocr/modeling/necks/csp_pan.py @@ -0,0 +1,324 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The code is based on: +# https://github.com/PaddlePaddle/PaddleDetection/blob/release%2F2.3/ppdet/modeling/necks/csp_pan.py + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +__all__ = ['CSPPAN'] + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channel=96, + out_channel=96, + kernel_size=3, + stride=1, + groups=1, + act='leaky_relu'): + super(ConvBNLayer, self).__init__() + initializer = nn.initializer.KaimingUniform() + self.act = act + assert self.act in ['leaky_relu', "hard_swish"] + self.conv = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + groups=groups, + padding=(kernel_size - 1) // 2, + stride=stride, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn = nn.BatchNorm2D(out_channel) + + def forward(self, x): + x = self.bn(self.conv(x)) + if self.act == "leaky_relu": + x = F.leaky_relu(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + return x + + +class DPModule(nn.Layer): + """ + Depth-wise and point-wise module. + Args: + in_channel (int): The input channels of this Module. + out_channel (int): The output channels of this Module. + kernel_size (int): The conv2d kernel size of this Module. + stride (int): The conv2d's stride of this Module. + act (str): The activation function of this Module, + Now support `leaky_relu` and `hard_swish`. 
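+    Descriptive note: a depthwise kernel_size x kernel_size convolution
+    followed by a pointwise 1x1 convolution, each wrapped with BatchNorm and
+    the chosen activation.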
+ """ + + def __init__(self, + in_channel=96, + out_channel=96, + kernel_size=3, + stride=1, + act='leaky_relu'): + super(DPModule, self).__init__() + initializer = nn.initializer.KaimingUniform() + self.act = act + self.dwconv = nn.Conv2D( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=kernel_size, + groups=out_channel, + padding=(kernel_size - 1) // 2, + stride=stride, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channel) + self.pwconv = nn.Conv2D( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + groups=1, + padding=0, + weight_attr=ParamAttr(initializer=initializer), + bias_attr=False) + self.bn2 = nn.BatchNorm2D(out_channel) + + def act_func(self, x): + if self.act == "leaky_relu": + x = F.leaky_relu(x) + elif self.act == "hard_swish": + x = F.hardswish(x) + return x + + def forward(self, x): + x = self.act_func(self.bn1(self.dwconv(x))) + x = self.act_func(self.bn2(self.pwconv(x))) + return x + + +class DarknetBottleneck(nn.Layer): + """The basic bottleneck block used in Darknet. + Each Block consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and act. + The first convLayer has filter size of 1x1 and the second one has the + filter size of 3x3. + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (int): The kernel size of the convolution. Default: 0.5 + add_identity (bool): Whether to add identity to the out. + Default: True + use_depthwise (bool): Whether to use depthwise separable convolution. + Default: False + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + expansion=0.5, + add_identity=True, + use_depthwise=False, + act="leaky_relu"): + super(DarknetBottleneck, self).__init__() + hidden_channels = int(out_channels * expansion) + conv_func = DPModule if use_depthwise else ConvBNLayer + self.conv1 = ConvBNLayer( + in_channel=in_channels, + out_channel=hidden_channels, + kernel_size=1, + act=act) + self.conv2 = conv_func( + in_channel=hidden_channels, + out_channel=out_channels, + kernel_size=kernel_size, + stride=1, + act=act) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(nn.Layer): + """Cross Stage Partial Layer. + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Default: 0.5 + num_blocks (int): Number of blocks. Default: 1 + add_identity (bool): Whether to add identity in blocks. + Default: True + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. 
Default: False + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + expand_ratio=0.5, + num_blocks=1, + add_identity=True, + use_depthwise=False, + act="leaky_relu"): + super().__init__() + mid_channels = int(out_channels * expand_ratio) + self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) + self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) + self.final_conv = ConvBNLayer( + 2 * mid_channels, out_channels, 1, act=act) + + self.blocks = nn.Sequential(* [ + DarknetBottleneck( + mid_channels, + mid_channels, + kernel_size, + 1.0, + add_identity, + use_depthwise, + act=act) for _ in range(num_blocks) + ]) + + def forward(self, x): + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = paddle.concat((x_main, x_short), axis=1) + return self.final_conv(x_final) + + +class Channel_T(nn.Layer): + def __init__(self, + in_channels=[116, 232, 464], + out_channels=96, + act="leaky_relu"): + super(Channel_T, self).__init__() + self.convs = nn.LayerList() + for i in range(len(in_channels)): + self.convs.append( + ConvBNLayer( + in_channels[i], out_channels, 1, act=act)) + + def forward(self, x): + outs = [self.convs[i](x[i]) for i in range(len(x))] + return outs + + +class CSPPAN(nn.Layer): + """Path Aggregation Network with CSP module. + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + kernel_size (int): The conv2d kernel size of this Module. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Default: True + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=5, + num_csp_blocks=1, + use_depthwise=True, + act='hard_swish'): + super(CSPPAN, self).__init__() + self.in_channels = in_channels + self.out_channels = [out_channels] * len(in_channels) + conv_func = DPModule if use_depthwise else ConvBNLayer + + self.conv_t = Channel_T(in_channels, out_channels, act=act) + + # build top-down blocks + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.top_down_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.top_down_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act)) + + # build bottom-up blocks + self.downsamples = nn.LayerList() + self.bottom_up_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv_func( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=2, + act=act)) + self.bottom_up_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + Returns: + tuple[Tensor]: CSPPAN features. 
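+            Example (illustrative shapes, e.g. the Channel_T defaults above):
+            with in_channels=[116, 232, 464] and out_channels=96, inputs of
+            shape (N, 116, H, W), (N, 232, H/2, W/2) and (N, 464, H/4, W/4)
+            yield three outputs with 96 channels each at the same spatial
+            sizes.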
+ """ + assert len(inputs) == len(self.in_channels) + inputs = self.conv_t(inputs) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + upsample_feat = F.upsample( + feat_heigh, size=paddle.shape(feat_low)[2:4], mode="nearest") + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx](paddle.concat( + [downsample_feat, feat_height], 1)) + outs.append(out) + + return tuple(outs) diff --git a/ppocr/modeling/necks/ct_fpn.py b/ppocr/modeling/necks/ct_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4d25e901b5b3093588571f0412a931eaf6f364 --- /dev/null +++ b/ppocr/modeling/necks/ct_fpn.py @@ -0,0 +1,185 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr +import os +import sys + +import math +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +ones_ = Constant(value=1.) +zeros_ = Constant(value=0.) + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..'))) + + +class Conv_BN_ReLU(nn.Layer): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2D( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_planes) + self.relu = nn.ReLU() + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. 
/ n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class FPEM(nn.Layer): + def __init__(self, in_channels, out_channels): + super(FPEM, self).__init__() + planes = out_channels + self.dwconv3_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer3_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv2_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer2_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv1_1 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer1_1 = Conv_BN_ReLU(planes, planes) + + self.dwconv2_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer2_2 = Conv_BN_ReLU(planes, planes) + + self.dwconv3_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer3_2 = Conv_BN_ReLU(planes, planes) + + self.dwconv4_2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=2, + padding=1, + groups=planes, + bias_attr=False) + self.smooth_layer4_2 = Conv_BN_ReLU(planes, planes) + + def _upsample_add(self, x, y): + return F.upsample(x, scale_factor=2, mode='bilinear') + y + + def forward(self, f1, f2, f3, f4): + # up-down + f3 = self.smooth_layer3_1(self.dwconv3_1(self._upsample_add(f4, f3))) + f2 = self.smooth_layer2_1(self.dwconv2_1(self._upsample_add(f3, f2))) + f1 = self.smooth_layer1_1(self.dwconv1_1(self._upsample_add(f2, f1))) + + # down-up + f2 = self.smooth_layer2_2(self.dwconv2_2(self._upsample_add(f2, f1))) + f3 = self.smooth_layer3_2(self.dwconv3_2(self._upsample_add(f3, f2))) + f4 = self.smooth_layer4_2(self.dwconv4_2(self._upsample_add(f4, f3))) + + return f1, f2, f3, f4 + + +class CTFPN(nn.Layer): + def __init__(self, in_channels, out_channel=128): + super(CTFPN, self).__init__() + self.out_channels = out_channel * 4 + + self.reduce_layer1 = Conv_BN_ReLU(in_channels[0], 128) + self.reduce_layer2 = Conv_BN_ReLU(in_channels[1], 128) + self.reduce_layer3 = Conv_BN_ReLU(in_channels[2], 128) + self.reduce_layer4 = Conv_BN_ReLU(in_channels[3], 128) + + self.fpem1 = FPEM(in_channels=(64, 128, 256, 512), out_channels=128) + self.fpem2 = FPEM(in_channels=(64, 128, 256, 512), out_channels=128) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def forward(self, f): + # # reduce channel + f1 = self.reduce_layer1(f[0]) # N,64,160,160 --> N, 128, 160, 160 + f2 = self.reduce_layer2(f[1]) # N, 128, 80, 80 --> N, 128, 80, 80 + f3 = self.reduce_layer3(f[2]) # N, 256, 40, 40 --> N, 128, 40, 40 + f4 = self.reduce_layer4(f[3]) # N, 512, 20, 20 --> N, 128, 20, 20 + + # FPEM + f1_1, f2_1, f3_1, f4_1 = self.fpem1(f1, f2, f3, f4) + f1_2, f2_2, f3_2, f4_2 = self.fpem2(f1_1, f2_1, f3_1, f4_1) + + # FFM + f1 = f1_1 + f1_2 + f2 = f2_1 + f2_2 + f3 = f3_1 + f3_2 + f4 = f4_1 + f4_2 + + f2 = self._upsample(f2, scale=2) + f3 = self._upsample(f3, scale=4) + f4 = self._upsample(f4, scale=8) + ff = paddle.concat((f1, f2, f3, f4), 1) # N,512, 160,160 + return ff diff --git a/ppocr/modeling/necks/db_fpn.py b/ppocr/modeling/necks/db_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8c3f52a331db5daafab2a38c0a441edd44eb141d --- /dev/null +++ 
b/ppocr/modeling/necks/db_fpn.py @@ -0,0 +1,427 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../../..'))) + +from ppocr.modeling.backbones.det_mobilenet_v3 import SEModule + + +class DSConv(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + padding, + stride=1, + groups=None, + if_act=True, + act="relu", + **kwargs): + super(DSConv, self).__init__() + if groups == None: + groups = in_channels + self.if_act = if_act + self.act = act + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + + self.bn1 = nn.BatchNorm(num_channels=in_channels, act=None) + + self.conv2 = nn.Conv2D( + in_channels=in_channels, + out_channels=int(in_channels * 4), + kernel_size=1, + stride=1, + bias_attr=False) + + self.bn2 = nn.BatchNorm(num_channels=int(in_channels * 4), act=None) + + self.conv3 = nn.Conv2D( + in_channels=int(in_channels * 4), + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self._c = [in_channels, out_channels] + if in_channels != out_channels: + self.conv_end = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + + def forward(self, inputs): + + x = self.conv1(inputs) + x = self.bn1(x) + + x = self.conv2(x) + x = self.bn2(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print("The activation function({}) is selected incorrectly.". 
+ format(self.act)) + exit() + + x = self.conv3(x) + if self._c[0] != self._c[1]: + x = x + self.conv_end(inputs) + return x + + +class DBFPN(nn.Layer): + def __init__(self, in_channels, out_channels, use_asf=False, **kwargs): + super(DBFPN, self).__init__() + self.out_channels = out_channels + self.use_asf = use_asf + weight_attr = paddle.nn.initializer.KaimingUniform() + + self.in2_conv = nn.Conv2D( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in3_conv = nn.Conv2D( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in4_conv = nn.Conv2D( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in5_conv = nn.Conv2D( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p5_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p4_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p3_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p2_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + + if self.use_asf is True: + self.asf = ASFBlock(self.out_channels, self.out_channels // 4) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + p5 = self.p5_conv(in5) + p4 = self.p4_conv(out4) + p3 = self.p3_conv(out3) + p2 = self.p2_conv(out2) + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + + if self.use_asf is True: + fuse = self.asf(fuse, [p5, p4, p3, p2]) + + return fuse + + +class RSELayer(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, shortcut=True): + super(RSELayer, self).__init__() + weight_attr = paddle.nn.initializer.KaimingUniform() + self.out_channels = out_channels + self.in_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=self.out_channels, + kernel_size=kernel_size, + padding=int(kernel_size // 2), + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.se_block = SEModule(self.out_channels) + self.shortcut = shortcut + + def forward(self, ins): + x = self.in_conv(ins) + if self.shortcut: + out = x + self.se_block(x) + else: + out = self.se_block(x) + return out + + +class RSEFPN(nn.Layer): + def 
__init__(self, in_channels, out_channels, shortcut=True, **kwargs): + super(RSEFPN, self).__init__() + self.out_channels = out_channels + self.ins_conv = nn.LayerList() + self.inp_conv = nn.LayerList() + + for i in range(len(in_channels)): + self.ins_conv.append( + RSELayer( + in_channels[i], + out_channels, + kernel_size=1, + shortcut=shortcut)) + self.inp_conv.append( + RSELayer( + out_channels, + out_channels // 4, + kernel_size=3, + shortcut=shortcut)) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + p5 = self.inp_conv[3](in5) + p4 = self.inp_conv[2](out4) + p3 = self.inp_conv[1](out3) + p2 = self.inp_conv[0](out2) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse + + +class LKPAN(nn.Layer): + def __init__(self, in_channels, out_channels, mode='large', **kwargs): + super(LKPAN, self).__init__() + self.out_channels = out_channels + weight_attr = paddle.nn.initializer.KaimingUniform() + + self.ins_conv = nn.LayerList() + self.inp_conv = nn.LayerList() + # pan head + self.pan_head_conv = nn.LayerList() + self.pan_lat_conv = nn.LayerList() + + if mode.lower() == 'lite': + p_layer = DSConv + elif mode.lower() == 'large': + p_layer = nn.Conv2D + else: + raise ValueError( + "mode can only be one of ['lite', 'large'], but received {}". 
+ format(mode)) + + for i in range(len(in_channels)): + self.ins_conv.append( + nn.Conv2D( + in_channels=in_channels[i], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + self.inp_conv.append( + p_layer( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + if i > 0: + self.pan_head_conv.append( + nn.Conv2D( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + stride=2, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + self.pan_lat_conv.append( + p_layer( + in_channels=self.out_channels // 4, + out_channels=self.out_channels // 4, + kernel_size=9, + padding=4, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False)) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.ins_conv[3](c5) + in4 = self.ins_conv[2](c4) + in3 = self.ins_conv[1](c3) + in2 = self.ins_conv[0](c2) + + out4 = in4 + F.upsample( + in5, scale_factor=2, mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, scale_factor=2, mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, scale_factor=2, mode="nearest", align_mode=1) # 1/4 + + f5 = self.inp_conv[3](in5) + f4 = self.inp_conv[2](out4) + f3 = self.inp_conv[1](out3) + f2 = self.inp_conv[0](out2) + + pan3 = f3 + self.pan_head_conv[0](f2) + pan4 = f4 + self.pan_head_conv[1](pan3) + pan5 = f5 + self.pan_head_conv[2](pan4) + + p2 = self.pan_lat_conv[0](f2) + p3 = self.pan_lat_conv[1](pan3) + p4 = self.pan_lat_conv[2](pan4) + p5 = self.pan_lat_conv[3](pan5) + + p5 = F.upsample(p5, scale_factor=8, mode="nearest", align_mode=1) + p4 = F.upsample(p4, scale_factor=4, mode="nearest", align_mode=1) + p3 = F.upsample(p3, scale_factor=2, mode="nearest", align_mode=1) + + fuse = paddle.concat([p5, p4, p3, p2], axis=1) + return fuse + + +class ASFBlock(nn.Layer): + """ + This code is refered from: + https://github.com/MhLiao/DB/blob/master/decoders/feature_attention.py + """ + + def __init__(self, in_channels, inter_channels, out_features_num=4): + """ + Adaptive Scale Fusion (ASF) block of DBNet++ + Args: + in_channels: the number of channels in the input data + inter_channels: the number of middle channels + out_features_num: the number of fused stages + """ + super(ASFBlock, self).__init__() + weight_attr = paddle.nn.initializer.KaimingUniform() + self.in_channels = in_channels + self.inter_channels = inter_channels + self.out_features_num = out_features_num + self.conv = nn.Conv2D(in_channels, inter_channels, 3, padding=1) + + self.spatial_scale = nn.Sequential( + #Nx1xHxW + nn.Conv2D( + in_channels=1, + out_channels=1, + kernel_size=3, + bias_attr=False, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr)), + nn.ReLU(), + nn.Conv2D( + in_channels=1, + out_channels=1, + kernel_size=1, + bias_attr=False, + weight_attr=ParamAttr(initializer=weight_attr)), + nn.Sigmoid()) + + self.channel_scale = nn.Sequential( + nn.Conv2D( + in_channels=inter_channels, + out_channels=out_features_num, + kernel_size=1, + bias_attr=False, + weight_attr=ParamAttr(initializer=weight_attr)), + nn.Sigmoid()) + + def forward(self, fuse_features, features_list): + fuse_features = self.conv(fuse_features) + spatial_x = paddle.mean(fuse_features, axis=1, keepdim=True) + attention_scores = self.spatial_scale(spatial_x) + fuse_features + attention_scores = 
self.channel_scale(attention_scores) + assert len(features_list) == self.out_features_num + + out_list = [] + for i in range(self.out_features_num): + out_list.append(attention_scores[:, i:i + 1] * features_list[i]) + return paddle.concat(out_list, axis=1) diff --git a/ppocr/modeling/necks/east_fpn.py b/ppocr/modeling/necks/east_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..120ff156cba2f7f8f8a562e76c82f1fac8584d4a --- /dev/null +++ b/ppocr/modeling/necks/east_fpn.py @@ -0,0 +1,188 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class EASTFPN(nn.Layer): + def __init__(self, in_channels, model_name, **kwargs): + super(EASTFPN, self).__init__() + self.model_name = model_name + if self.model_name == "large": + self.out_channels = 128 + else: + self.out_channels = 64 + self.in_channels = in_channels[::-1] + self.h1_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[1], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_1") + self.h2_conv = ConvBNLayer( + 
in_channels=self.out_channels+self.in_channels[2], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_2") + self.h3_conv = ConvBNLayer( + in_channels=self.out_channels+self.in_channels[3], + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_h_3") + self.g0_deconv = DeConvBNLayer( + in_channels=self.in_channels[0], + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_0") + self.g1_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_1") + self.g2_deconv = DeConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=4, + stride=2, + padding=1, + if_act=True, + act='relu', + name="unet_g_2") + self.g3_conv = ConvBNLayer( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + if_act=True, + act='relu', + name="unet_g_3") + + def forward(self, x): + f = x[::-1] + + h = f[0] + g = self.g0_deconv(h) + h = paddle.concat([g, f[1]], axis=1) + h = self.h1_conv(h) + g = self.g1_deconv(h) + h = paddle.concat([g, f[2]], axis=1) + h = self.h2_conv(h) + g = self.g2_deconv(h) + h = paddle.concat([g, f[3]], axis=1) + h = self.h3_conv(h) + g = self.g3_conv(h) + + return g \ No newline at end of file diff --git a/ppocr/modeling/necks/fce_fpn.py b/ppocr/modeling/necks/fce_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..954e964e97d6061d9cc684ccac8a1b137d5d069c --- /dev/null +++ b/ppocr/modeling/necks/fce_fpn.py @@ -0,0 +1,280 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.3/ppdet/modeling/necks/fpn.py +""" + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import XavierUniform +from paddle.nn.initializer import Normal +from paddle.regularizer import L2Decay + +__all__ = ['FCEFPN'] + + +class ConvNormLayer(nn.Layer): + def __init__(self, + ch_in, + ch_out, + filter_size, + stride, + groups=1, + norm_type='bn', + norm_decay=0., + norm_groups=32, + lr_scale=1., + freeze_norm=False, + initializer=Normal( + mean=0., std=0.01)): + super(ConvNormLayer, self).__init__() + assert norm_type in ['bn', 'sync_bn', 'gn'] + + bias_attr = False + + self.conv = nn.Conv2D( + in_channels=ch_in, + out_channels=ch_out, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=initializer, learning_rate=1.), + bias_attr=bias_attr) + + norm_lr = 0. if freeze_norm else 1. 
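+        # When freeze_norm is True, the normalization weight and bias below get
+        # a zero learning rate, so they stay fixed while the rest of the neck trains.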
+ param_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + bias_attr = ParamAttr( + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay) if norm_decay is not None else None) + if norm_type == 'bn': + self.norm = nn.BatchNorm2D( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'sync_bn': + self.norm = nn.SyncBatchNorm( + ch_out, weight_attr=param_attr, bias_attr=bias_attr) + elif norm_type == 'gn': + self.norm = nn.GroupNorm( + num_groups=norm_groups, + num_channels=ch_out, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + return out + + +class FCEFPN(nn.Layer): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + Args: + in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channels (list[int]): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. + default False + + """ + + def __init__(self, + in_channels, + out_channels, + spatial_scales=[0.25, 0.125, 0.0625, 0.03125], + has_extra_convs=False, + extra_stage=1, + use_c5=True, + norm_type=None, + norm_decay=0., + freeze_norm=False, + relu_before_extra_convs=True): + super(FCEFPN, self).__init__() + self.out_channels = out_channels + for s in range(extra_stage): + spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
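+        # Each extra stage appends one more pyramid level on top, whose spatial
+        # scale is half that of the previous level (i.e. its stride is doubled).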
+ self.spatial_scales = spatial_scales + self.has_extra_convs = has_extra_convs + self.extra_stage = extra_stage + self.use_c5 = use_c5 + self.relu_before_extra_convs = relu_before_extra_convs + self.norm_type = norm_type + self.norm_decay = norm_decay + self.freeze_norm = freeze_norm + + self.lateral_convs = [] + self.fpn_convs = [] + fan = out_channels * 3 * 3 + + # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone + # 0 <= st_stage < ed_stage <= 3 + st_stage = 4 - len(in_channels) + ed_stage = st_stage + len(in_channels) - 1 + for i in range(st_stage, ed_stage + 1): + if i == 3: + lateral_name = 'fpn_inner_res5_sum' + else: + lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) + in_c = in_channels[i - st_stage] + if self.norm_type is not None: + lateral = self.add_sublayer( + lateral_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=1, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=in_c))) + else: + lateral = self.add_sublayer( + lateral_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channels, + kernel_size=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=in_c)))) + self.lateral_convs.append(lateral) + + for i in range(st_stage, ed_stage + 1): + fpn_name = 'fpn_res{}_sum'.format(i + 2) + if self.norm_type is not None: + fpn_conv = self.add_sublayer( + fpn_name, + ConvNormLayer( + ch_in=out_channels, + ch_out=out_channels, + filter_size=3, + stride=1, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + fpn_conv = self.add_sublayer( + fpn_name, + nn.Conv2D( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(fpn_conv) + + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + if self.has_extra_convs: + for i in range(self.extra_stage): + lvl = ed_stage + 1 + i + if i == 0 and self.use_c5: + in_c = in_channels[-1] + else: + in_c = out_channels + extra_fpn_name = 'fpn_{}'.format(lvl + 2) + if self.norm_type is not None: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + ConvNormLayer( + ch_in=in_c, + ch_out=out_channels, + filter_size=3, + stride=2, + norm_type=self.norm_type, + norm_decay=self.norm_decay, + freeze_norm=self.freeze_norm, + initializer=XavierUniform(fan_out=fan))) + else: + extra_fpn_conv = self.add_sublayer( + extra_fpn_name, + nn.Conv2D( + in_channels=in_c, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + weight_attr=ParamAttr( + initializer=XavierUniform(fan_out=fan)))) + self.fpn_convs.append(extra_fpn_conv) + + @classmethod + def from_config(cls, cfg, input_shape): + return { + 'in_channels': [i.channels for i in input_shape], + 'spatial_scales': [1.0 / i.stride for i in input_shape], + } + + def forward(self, body_feats): + laterals = [] + num_levels = len(body_feats) + + for i in range(num_levels): + laterals.append(self.lateral_convs[i](body_feats[i])) + + for i in range(1, num_levels): + lvl = num_levels - i + upsample = F.interpolate( + laterals[lvl], + scale_factor=2., + mode='nearest', ) + laterals[lvl - 1] += upsample + + fpn_output = [] + for lvl in range(num_levels): + fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) + + if self.extra_stage > 0: + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not 
self.has_extra_convs: + assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' + fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) + # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) + else: + if self.use_c5: + extra_source = body_feats[-1] + else: + extra_source = fpn_output[-1] + fpn_output.append(self.fpn_convs[num_levels](extra_source)) + + for i in range(1, self.extra_stage): + if self.relu_before_extra_convs: + fpn_output.append(self.fpn_convs[num_levels + i](F.relu( + fpn_output[-1]))) + else: + fpn_output.append(self.fpn_convs[num_levels + i]( + fpn_output[-1])) + return fpn_output diff --git a/ppocr/modeling/necks/fpn.py b/ppocr/modeling/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..48c85b1e53bd889bc887e8fedcd33b1b12cb734b --- /dev/null +++ b/ppocr/modeling/necks/fpn.py @@ -0,0 +1,138 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/neck/fpn.py +""" + +import paddle.nn as nn +import paddle +import math +import paddle.nn.functional as F + + +class Conv_BN_ReLU(nn.Layer): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0): + super(Conv_BN_ReLU, self).__init__() + self.conv = nn.Conv2D( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False) + self.bn = nn.BatchNorm2D(out_planes, momentum=0.1) + self.relu = nn.ReLU() + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Normal( + 0, math.sqrt(2. 
/ n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +class FPN(nn.Layer): + def __init__(self, in_channels, out_channels): + super(FPN, self).__init__() + + # Top layer + self.toplayer_ = Conv_BN_ReLU( + in_channels[3], out_channels, kernel_size=1, stride=1, padding=0) + # Lateral layers + self.latlayer1_ = Conv_BN_ReLU( + in_channels[2], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer2_ = Conv_BN_ReLU( + in_channels[1], out_channels, kernel_size=1, stride=1, padding=0) + + self.latlayer3_ = Conv_BN_ReLU( + in_channels[0], out_channels, kernel_size=1, stride=1, padding=0) + + # Smooth layers + self.smooth1_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth2_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.smooth3_ = Conv_BN_ReLU( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.out_channels = out_channels * 4 + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Normal( + 0, math.sqrt(2. / n))) + elif isinstance(m, nn.BatchNorm2D): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Constant(0.0)) + + def _upsample(self, x, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + + def _upsample_add(self, x, y, scale=1): + return F.upsample(x, scale_factor=scale, mode='bilinear') + y + + def forward(self, x): + f2, f3, f4, f5 = x + p5 = self.toplayer_(f5) + + f4 = self.latlayer1_(f4) + p4 = self._upsample_add(p5, f4, 2) + p4 = self.smooth1_(p4) + + f3 = self.latlayer2_(f3) + p3 = self._upsample_add(p4, f3, 2) + p3 = self.smooth2_(p3) + + f2 = self.latlayer3_(f2) + p2 = self._upsample_add(p3, f2, 2) + p2 = self.smooth3_(p2) + + p3 = self._upsample(p3, 2) + p4 = self._upsample(p4, 4) + p5 = self._upsample(p5, 8) + + fuse = paddle.concat([p2, p3, p4, p5], axis=1) + return fuse diff --git a/ppocr/modeling/necks/fpn_unet.py b/ppocr/modeling/necks/fpn_unet.py new file mode 100644 index 0000000000000000000000000000000000000000..34e94a8b50532cfbbfea1cecdba6cfb0d5a239cd --- /dev/null +++ b/ppocr/modeling/necks/fpn_unet.py @@ -0,0 +1,97 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
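+# FPN_UNet (defined below) fuses the four backbone stages C2-C5 in a U-Net
+# fashion: C5 is first up-sampled with a transposed convolution, concatenated
+# with the next shallower stage, and refined by an UpBlock (1x1 conv, 3x3 conv,
+# 2x transposed conv) until the output reaches 4x the resolution of C2.
+# A minimal usage sketch; channel counts and shapes are illustrative only:
+#   neck = FPN_UNet(in_channels=[64, 128, 256, 512], out_channels=64)
+#   c2, c3, c4, c5 = [paddle.rand([1, c, 160 // s, 160 // s])
+#                     for c, s in zip([64, 128, 256, 512], [4, 8, 16, 32])]
+#   out = neck([c2, c3, c4, c5])  # shape [1, 64, 160, 160]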
+""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/necks/fpn_unet.py +""" + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class UpBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + assert isinstance(in_channels, int) + assert isinstance(out_channels, int) + + self.conv1x1 = nn.Conv2D( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.conv3x3 = nn.Conv2D( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.deconv = nn.Conv2DTranspose( + out_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x): + x = F.relu(self.conv1x1(x)) + x = F.relu(self.conv3x3(x)) + x = self.deconv(x) + return x + + +class FPN_UNet(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + assert len(in_channels) == 4 + assert isinstance(out_channels, int) + self.out_channels = out_channels + + blocks_out_channels = [out_channels] + [ + min(out_channels * 2**i, 256) for i in range(4) + ] + blocks_in_channels = [blocks_out_channels[1]] + [ + in_channels[i] + blocks_out_channels[i + 2] for i in range(3) + ] + [in_channels[3]] + + self.up4 = nn.Conv2DTranspose( + blocks_in_channels[4], + blocks_out_channels[4], + kernel_size=4, + stride=2, + padding=1) + self.up_block3 = UpBlock(blocks_in_channels[3], blocks_out_channels[3]) + self.up_block2 = UpBlock(blocks_in_channels[2], blocks_out_channels[2]) + self.up_block1 = UpBlock(blocks_in_channels[1], blocks_out_channels[1]) + self.up_block0 = UpBlock(blocks_in_channels[0], blocks_out_channels[0]) + + def forward(self, x): + """ + Args: + x (list[Tensor] | tuple[Tensor]): A list of four tensors of shape + :math:`(N, C_i, H_i, W_i)`, representing C2, C3, C4, C5 + features respectively. :math:`C_i` should matches the number in + ``in_channels``. + + Returns: + Tensor: Shape :math:`(N, C, H, W)` where :math:`H=4H_0` and + :math:`W=4W_0`. + """ + c2, c3, c4, c5 = x + + x = F.relu(self.up4(c5)) + + x = paddle.concat([x, c4], axis=1) + x = F.relu(self.up_block3(x)) + + x = paddle.concat([x, c3], axis=1) + x = F.relu(self.up_block2(x)) + + x = paddle.concat([x, c2], axis=1) + x = F.relu(self.up_block1(x)) + + x = self.up_block0(x) + return x diff --git a/ppocr/modeling/necks/pg_fpn.py b/ppocr/modeling/necks/pg_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..3f64539f790b55bb1f95adc8d3c78b84ca2fccc5 --- /dev/null +++ b/ppocr/modeling/necks/pg_fpn.py @@ -0,0 +1,314 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = nn.AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + use_global_stats=False) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=4, + stride=2, + padding=1, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance", + use_global_stats=False) + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class PGFPN(nn.Layer): + def __init__(self, in_channels, **kwargs): + super(PGFPN, self).__init__() + num_inputs = [2048, 2048, 1024, 512, 256] + num_outputs = [256, 256, 192, 192, 128] + self.out_channels = 128 + self.conv_bn_layer_1 = ConvBNLayer( + in_channels=3, + out_channels=32, + kernel_size=3, + stride=1, + act=None, + name='FPN_d1') + self.conv_bn_layer_2 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act=None, + name='FPN_d2') + self.conv_bn_layer_3 = ConvBNLayer( + in_channels=256, + out_channels=128, + kernel_size=3, + stride=1, + act=None, + name='FPN_d3') + self.conv_bn_layer_4 = ConvBNLayer( + in_channels=32, + out_channels=64, + kernel_size=3, + stride=2, + act=None, + name='FPN_d4') + self.conv_bn_layer_5 = ConvBNLayer( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d5') + self.conv_bn_layer_6 = ConvBNLayer( + in_channels=64, + out_channels=128, + kernel_size=3, + stride=2, + act=None, + name='FPN_d6') + self.conv_bn_layer_7 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + act='relu', + name='FPN_d7') + self.conv_bn_layer_8 = ConvBNLayer( + in_channels=128, + out_channels=128, + kernel_size=1, + stride=1, + act=None, + name='FPN_d8') + + self.conv_h0 = ConvBNLayer( + in_channels=num_inputs[0], + out_channels=num_outputs[0], + kernel_size=1, + stride=1, + act=None, + 
name="conv_h{}".format(0)) + self.conv_h1 = ConvBNLayer( + in_channels=num_inputs[1], + out_channels=num_outputs[1], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(1)) + self.conv_h2 = ConvBNLayer( + in_channels=num_inputs[2], + out_channels=num_outputs[2], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(2)) + self.conv_h3 = ConvBNLayer( + in_channels=num_inputs[3], + out_channels=num_outputs[3], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(3)) + self.conv_h4 = ConvBNLayer( + in_channels=num_inputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_h{}".format(4)) + + self.dconv0 = DeConvBNLayer( + in_channels=num_outputs[0], + out_channels=num_outputs[0 + 1], + name="dconv_{}".format(0)) + self.dconv1 = DeConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1 + 1], + act=None, + name="dconv_{}".format(1)) + self.dconv2 = DeConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2 + 1], + act=None, + name="dconv_{}".format(2)) + self.dconv3 = DeConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3 + 1], + act=None, + name="dconv_{}".format(3)) + self.conv_g1 = ConvBNLayer( + in_channels=num_outputs[1], + out_channels=num_outputs[1], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(1)) + self.conv_g2 = ConvBNLayer( + in_channels=num_outputs[2], + out_channels=num_outputs[2], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(2)) + self.conv_g3 = ConvBNLayer( + in_channels=num_outputs[3], + out_channels=num_outputs[3], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(3)) + self.conv_g4 = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=3, + stride=1, + act='relu', + name="conv_g{}".format(4)) + self.convf = ConvBNLayer( + in_channels=num_outputs[4], + out_channels=num_outputs[4], + kernel_size=1, + stride=1, + act=None, + name="conv_f{}".format(4)) + + def forward(self, x): + c0, c1, c2, c3, c4, c5, c6 = x + # FPN_Down_Fusion + f = [c0, c1, c2] + g = [None, None, None] + h = [None, None, None] + h[0] = self.conv_bn_layer_1(f[0]) + h[1] = self.conv_bn_layer_2(f[1]) + h[2] = self.conv_bn_layer_3(f[2]) + + g[0] = self.conv_bn_layer_4(h[0]) + g[1] = paddle.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_bn_layer_5(g[1]) + g[1] = self.conv_bn_layer_6(g[1]) + + g[2] = paddle.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_bn_layer_7(g[2]) + f_down = self.conv_bn_layer_8(g[2]) + + # FPN UP Fusion + f1 = [c6, c5, c4, c3, c2] + g = [None, None, None, None, None] + h = [None, None, None, None, None] + h[0] = self.conv_h0(f1[0]) + h[1] = self.conv_h1(f1[1]) + h[2] = self.conv_h2(f1[2]) + h[3] = self.conv_h3(f1[3]) + h[4] = self.conv_h4(f1[4]) + + g[0] = self.dconv0(h[0]) + g[1] = paddle.add(g[0], h[1]) + g[1] = F.relu(g[1]) + g[1] = self.conv_g1(g[1]) + g[1] = self.dconv1(g[1]) + + g[2] = paddle.add(g[1], h[2]) + g[2] = F.relu(g[2]) + g[2] = self.conv_g2(g[2]) + g[2] = self.dconv2(g[2]) + + g[3] = paddle.add(g[2], h[3]) + g[3] = F.relu(g[3]) + g[3] = self.conv_g3(g[3]) + g[3] = self.dconv3(g[3]) + + g[4] = paddle.add(x=g[3], y=h[4]) + g[4] = F.relu(g[4]) + g[4] = self.conv_g4(g[4]) + f_up = self.convf(g[4]) + f_common = paddle.add(f_down, f_up) + f_common = F.relu(f_common) + return f_common diff --git a/ppocr/modeling/necks/pren_fpn.py b/ppocr/modeling/necks/pren_fpn.py new file mode 100644 index 
0000000000000000000000000000000000000000..afbdcea83d9a5a827d805a542b56e6be11b03389 --- /dev/null +++ b/ppocr/modeling/necks/pren_fpn.py @@ -0,0 +1,163 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Code is refer from: +https://github.com/RuijieJ/pren/blob/main/Nets/Aggregation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F + + +class PoolAggregate(nn.Layer): + def __init__(self, n_r, d_in, d_middle=None, d_out=None): + super(PoolAggregate, self).__init__() + if not d_middle: + d_middle = d_in + if not d_out: + d_out = d_in + + self.d_in = d_in + self.d_middle = d_middle + self.d_out = d_out + self.act = nn.Swish() + + self.n_r = n_r + self.aggs = self._build_aggs() + + def _build_aggs(self): + aggs = [] + for i in range(self.n_r): + aggs.append( + self.add_sublayer( + '{}'.format(i), + nn.Sequential( + ('conv1', nn.Conv2D( + self.d_in, self.d_middle, 3, 2, 1, bias_attr=False) + ), ('bn1', nn.BatchNorm(self.d_middle)), + ('act', self.act), ('conv2', nn.Conv2D( + self.d_middle, self.d_out, 3, 2, 1, bias_attr=False + )), ('bn2', nn.BatchNorm(self.d_out))))) + return aggs + + def forward(self, x): + b = x.shape[0] + outs = [] + for agg in self.aggs: + y = agg(x) + p = F.adaptive_avg_pool2d(y, 1) + outs.append(p.reshape((b, 1, self.d_out))) + out = paddle.concat(outs, 1) + return out + + +class WeightAggregate(nn.Layer): + def __init__(self, n_r, d_in, d_middle=None, d_out=None): + super(WeightAggregate, self).__init__() + if not d_middle: + d_middle = d_in + if not d_out: + d_out = d_in + + self.n_r = n_r + self.d_out = d_out + self.act = nn.Swish() + + self.conv_n = nn.Sequential( + ('conv1', nn.Conv2D( + d_in, d_in, 3, 1, 1, + bias_attr=False)), ('bn1', nn.BatchNorm(d_in)), + ('act1', self.act), ('conv2', nn.Conv2D( + d_in, n_r, 1, bias_attr=False)), ('bn2', nn.BatchNorm(n_r)), + ('act2', nn.Sigmoid())) + self.conv_d = nn.Sequential( + ('conv1', nn.Conv2D( + d_in, d_middle, 3, 1, 1, + bias_attr=False)), ('bn1', nn.BatchNorm(d_middle)), + ('act1', self.act), ('conv2', nn.Conv2D( + d_middle, d_out, 1, + bias_attr=False)), ('bn2', nn.BatchNorm(d_out))) + + def forward(self, x): + b, _, h, w = x.shape + + hmaps = self.conv_n(x) + fmaps = self.conv_d(x) + r = paddle.bmm( + hmaps.reshape((b, self.n_r, h * w)), + fmaps.reshape((b, self.d_out, h * w)).transpose((0, 2, 1))) + return r + + +class GCN(nn.Layer): + def __init__(self, d_in, n_in, d_out=None, n_out=None, dropout=0.1): + super(GCN, self).__init__() + if not d_out: + d_out = d_in + if not n_out: + n_out = d_in + + self.conv_n = nn.Conv1D(n_in, n_out, 1) + self.linear = nn.Linear(d_in, d_out) + self.dropout = nn.Dropout(dropout) + self.act = nn.Swish() + + def forward(self, x): + x = self.conv_n(x) + x = self.dropout(self.linear(x)) + return self.act(x) + + +class PRENFPN(nn.Layer): + def __init__(self, in_channels, n_r, 
d_model, max_len, dropout): + super(PRENFPN, self).__init__() + assert len(in_channels) == 3, "in_channels' length must be 3." + c1, c2, c3 = in_channels # the depths are from big to small + # build fpn + assert d_model % 3 == 0, "{} can't be divided by 3.".format(d_model) + self.agg_p1 = PoolAggregate(n_r, c1, d_out=d_model // 3) + self.agg_p2 = PoolAggregate(n_r, c2, d_out=d_model // 3) + self.agg_p3 = PoolAggregate(n_r, c3, d_out=d_model // 3) + + self.agg_w1 = WeightAggregate(n_r, c1, 4 * c1, d_model // 3) + self.agg_w2 = WeightAggregate(n_r, c2, 4 * c2, d_model // 3) + self.agg_w3 = WeightAggregate(n_r, c3, 4 * c3, d_model // 3) + + self.gcn_pool = GCN(d_model, n_r, d_model, max_len, dropout) + self.gcn_weight = GCN(d_model, n_r, d_model, max_len, dropout) + + self.out_channels = d_model + + def forward(self, inputs): + f3, f5, f7 = inputs + + rp1 = self.agg_p1(f3) + rp2 = self.agg_p2(f5) + rp3 = self.agg_p3(f7) + rp = paddle.concat([rp1, rp2, rp3], 2) # [b,nr,d] + + rw1 = self.agg_w1(f3) + rw2 = self.agg_w2(f5) + rw3 = self.agg_w3(f7) + rw = paddle.concat([rw1, rw2, rw3], 2) # [b,nr,d] + + y1 = self.gcn_pool(rp) + y2 = self.gcn_weight(rw) + y = 0.5 * (y1 + y2) + return y # [b,max_len,d] diff --git a/ppocr/modeling/necks/rf_adaptor.py b/ppocr/modeling/necks/rf_adaptor.py new file mode 100644 index 0000000000000000000000000000000000000000..94590127b0fc3027eb0c06609ad60620a120621d --- /dev/null +++ b/ppocr/modeling/necks/rf_adaptor.py @@ -0,0 +1,137 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/connects/single_block/RFAdaptor.py +""" + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal + +kaiming_init_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
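+# Module-level initializers shared by the adaptor modules below: Kaiming-normal
+# for conv weights, zeros for conv/norm bias and ones for norm weight
+# (see S2VAdaptor.init_weights).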
+ + +class S2VAdaptor(nn.Layer): + """ Semantic to Visual adaptation module""" + + def __init__(self, in_channels=512): + super(S2VAdaptor, self).__init__() + + self.in_channels = in_channels # 512 + + # feature strengthen module, channel attention + self.channel_inter = nn.Linear( + self.in_channels, self.in_channels, bias_attr=False) + self.channel_bn = nn.BatchNorm1D(self.in_channels) + self.channel_act = nn.ReLU() + self.apply(self.init_weights) + + def init_weights(self, m): + if isinstance(m, nn.Conv2D): + kaiming_init_(m.weight) + if isinstance(m, nn.Conv2D) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.BatchNorm1D)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, semantic): + semantic_source = semantic # batch, channel, height, width + + # feature transformation + semantic = semantic.squeeze(2).transpose( + [0, 2, 1]) # batch, width, channel + channel_att = self.channel_inter(semantic) # batch, width, channel + channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width + channel_bn = self.channel_bn(channel_att) # batch, channel, width + channel_att = self.channel_act(channel_bn) # batch, channel, width + + # Feature enhancement + channel_output = semantic_source * channel_att.unsqueeze( + -2) # batch, channel, 1, width + + return channel_output + + +class V2SAdaptor(nn.Layer): + """ Visual to Semantic adaptation module""" + + def __init__(self, in_channels=512, return_mask=False): + super(V2SAdaptor, self).__init__() + + # parameter initialization + self.in_channels = in_channels + self.return_mask = return_mask + + # output transformation + self.channel_inter = nn.Linear( + self.in_channels, self.in_channels, bias_attr=False) + self.channel_bn = nn.BatchNorm1D(self.in_channels) + self.channel_act = nn.ReLU() + + def forward(self, visual): + # Feature enhancement + visual = visual.squeeze(2).transpose([0, 2, 1]) # batch, width, channel + channel_att = self.channel_inter(visual) # batch, width, channel + channel_att = channel_att.transpose([0, 2, 1]) # batch, channel, width + channel_bn = self.channel_bn(channel_att) # batch, channel, width + channel_att = self.channel_act(channel_bn) # batch, channel, width + + # size alignment + channel_output = channel_att.unsqueeze(-2) # batch, width, channel + + if self.return_mask: + return channel_output, channel_att + return channel_output + + +class RFAdaptor(nn.Layer): + def __init__(self, in_channels=512, use_v2s=True, use_s2v=True, **kwargs): + super(RFAdaptor, self).__init__() + if use_v2s is True: + self.neck_v2s = V2SAdaptor(in_channels=in_channels, **kwargs) + else: + self.neck_v2s = None + if use_s2v is True: + self.neck_s2v = S2VAdaptor(in_channels=in_channels, **kwargs) + else: + self.neck_s2v = None + self.out_channels = in_channels + + def forward(self, x): + visual_feature, rcg_feature = x + if visual_feature is not None: + batch, source_channels, v_source_height, v_source_width = visual_feature.shape + visual_feature = visual_feature.reshape( + [batch, source_channels, 1, v_source_height * v_source_width]) + + if self.neck_v2s is not None: + v_rcg_feature = rcg_feature * self.neck_v2s(visual_feature) + else: + v_rcg_feature = rcg_feature + + if self.neck_s2v is not None: + v_visual_feature = visual_feature + self.neck_s2v(rcg_feature) + else: + v_visual_feature = visual_feature + if v_rcg_feature is not None: + batch, source_channels, source_height, source_width = v_rcg_feature.shape + v_rcg_feature = v_rcg_feature.reshape( + [batch, source_channels, 1, 
source_height * source_width]) + + v_rcg_feature = v_rcg_feature.squeeze(2).transpose([0, 2, 1]) + return v_visual_feature, v_rcg_feature diff --git a/ppocr/modeling/necks/rnn.py b/ppocr/modeling/necks/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..33be9400b34cb535d260881748e179c3df106caa --- /dev/null +++ b/ppocr/modeling/necks/rnn.py @@ -0,0 +1,245 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + +from ppocr.modeling.heads.rec_ctc_head import get_para_bias_attr +from ppocr.modeling.backbones.rec_svtrnet import Block, ConvBNLayer, trunc_normal_, zeros_, ones_ + + +class Im2Seq(nn.Layer): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + B, C, H, W = x.shape + assert H == 1 + x = x.squeeze(axis=2) + x = x.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + return x + + +class EncoderWithRNN(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithRNN, self).__init__() + self.out_channels = hidden_size * 2 + self.lstm = nn.LSTM( + in_channels, hidden_size, direction='bidirectional', num_layers=2) + + def forward(self, x): + x, _ = self.lstm(x) + return x + +class BidirectionalLSTM(nn.Layer): + def __init__(self, input_size, + hidden_size, + output_size=None, + num_layers=1, + dropout=0, + direction=False, + time_major=False, + with_linear=False): + super(BidirectionalLSTM, self).__init__() + self.with_linear = with_linear + self.rnn = nn.LSTM(input_size, + hidden_size, + num_layers=num_layers, + dropout=dropout, + direction=direction, + time_major=time_major) + + # text recognition the specified structure LSTM with linear + if self.with_linear: + self.linear = nn.Linear(hidden_size * 2, output_size) + + def forward(self, input_feature): + recurrent, _ = self.rnn(input_feature) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) + if self.with_linear: + output = self.linear(recurrent) # batch_size x T x output_size + return output + return recurrent + +class EncoderWithCascadeRNN(nn.Layer): + def __init__(self, in_channels, hidden_size, out_channels, num_layers=2, with_linear=False): + super(EncoderWithCascadeRNN, self).__init__() + self.out_channels = out_channels[-1] + self.encoder = nn.LayerList( + [BidirectionalLSTM( + in_channels if i == 0 else out_channels[i - 1], + hidden_size, + output_size=out_channels[i], + num_layers=1, + direction='bidirectional', + with_linear=with_linear) + for i in range(num_layers)] + ) + + + def forward(self, x): + for i, l in enumerate(self.encoder): + x = l(x) + return x + + +class EncoderWithFC(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(EncoderWithFC, self).__init__() + self.out_channels = hidden_size + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=0.00001, 
k=in_channels) + self.fc = nn.Linear( + in_channels, + hidden_size, + weight_attr=weight_attr, + bias_attr=bias_attr, + name='reduce_encoder_fea') + + def forward(self, x): + x = self.fc(x) + return x + + +class EncoderWithSVTR(nn.Layer): + def __init__( + self, + in_channels, + dims=64, # XS + depth=2, + hidden_dims=120, + use_guide=False, + num_heads=8, + qkv_bias=True, + mlp_ratio=2.0, + drop_rate=0.1, + attn_drop_rate=0.1, + drop_path=0., + qk_scale=None): + super(EncoderWithSVTR, self).__init__() + self.depth = depth + self.use_guide = use_guide + self.conv1 = ConvBNLayer( + in_channels, in_channels // 8, padding=1, act=nn.Swish) + self.conv2 = ConvBNLayer( + in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish) + + self.svtr_block = nn.LayerList([ + Block( + dim=hidden_dims, + num_heads=num_heads, + mixer='Global', + HW=None, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=nn.Swish, + attn_drop=attn_drop_rate, + drop_path=drop_path, + norm_layer='nn.LayerNorm', + epsilon=1e-05, + prenorm=False) for i in range(depth) + ]) + self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6) + self.conv3 = ConvBNLayer( + hidden_dims, in_channels, kernel_size=1, act=nn.Swish) + # last conv-nxn, the input is concat of input tensor and conv3 output tensor + self.conv4 = ConvBNLayer( + 2 * in_channels, in_channels // 8, padding=1, act=nn.Swish) + + self.conv1x1 = ConvBNLayer( + in_channels // 8, dims, kernel_size=1, act=nn.Swish) + self.out_channels = dims + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + # for use guide + if self.use_guide: + z = x.clone() + z.stop_gradient = True + else: + z = x + # for short cut + h = z + # reduce dim + z = self.conv1(z) + z = self.conv2(z) + # SVTR global block + B, C, H, W = z.shape + z = z.flatten(2).transpose([0, 2, 1]) + for blk in self.svtr_block: + z = blk(z) + z = self.norm(z) + # last stage + z = z.reshape([0, H, W, C]).transpose([0, 3, 1, 2]) + z = self.conv3(z) + z = paddle.concat((h, z), axis=1) + z = self.conv1x1(self.conv4(z)) + return z + + +class SequenceEncoder(nn.Layer): + def __init__(self, in_channels, encoder_type, hidden_size=48, **kwargs): + super(SequenceEncoder, self).__init__() + self.encoder_reshape = Im2Seq(in_channels) + self.out_channels = self.encoder_reshape.out_channels + self.encoder_type = encoder_type + if encoder_type == 'reshape': + self.only_reshape = True + else: + support_encoder_dict = { + 'reshape': Im2Seq, + 'fc': EncoderWithFC, + 'rnn': EncoderWithRNN, + 'svtr': EncoderWithSVTR, + 'cascadernn': EncoderWithCascadeRNN + } + assert encoder_type in support_encoder_dict, '{} must in {}'.format( + encoder_type, support_encoder_dict.keys()) + if encoder_type == "svtr": + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, **kwargs) + elif encoder_type == 'cascadernn': + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size, **kwargs) + else: + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels, hidden_size) + self.out_channels = self.encoder.out_channels + self.only_reshape = False + + def forward(self, x): + if self.encoder_type != 'svtr': + x = self.encoder_reshape(x) + if not self.only_reshape: + x = self.encoder(x) + 
return x + else: + x = self.encoder(x) + x = self.encoder_reshape(x) + return x diff --git a/ppocr/modeling/necks/sast_fpn.py b/ppocr/modeling/necks/sast_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..9b6024590ac574f725fc6c519dd96ee4462276d6 --- /dev/null +++ b/ppocr/modeling/necks/sast_fpn.py @@ -0,0 +1,284 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class DeConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + if_act=True, + act=None, + name=None): + super(DeConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.deconv = nn.Conv2DTranspose( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + '_weights'), + bias_attr=False) + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name="bn_" + name + "_scale"), + bias_attr=ParamAttr(name="bn_" + name + "_offset"), + moving_mean_name="bn_" + name + "_mean", + moving_variance_name="bn_" + name + "_variance") + + def forward(self, x): + x = self.deconv(x) + x = self.bn(x) + return x + + +class FPN_Up_Fusion(nn.Layer): + def __init__(self, in_channels): + super(FPN_Up_Fusion, self).__init__() + in_channels = in_channels[::-1] + out_channels = [256, 256, 192, 192, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 1, 1, act=None, name='fpn_up_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 1, 1, act=None, name='fpn_up_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 1, 1, act=None, name='fpn_up_h2') + self.h3_conv = ConvBNLayer(in_channels[3], out_channels[3], 1, 1, act=None, name='fpn_up_h3') + self.h4_conv = ConvBNLayer(in_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_h4') + + self.g0_conv = 
DeConvBNLayer(out_channels[0], out_channels[1], 4, 2, act=None, name='fpn_up_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_up_g1_1'), + DeConvBNLayer(out_channels[1], out_channels[2], 4, 2, act=None, name='fpn_up_g1_2') + ) + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_up_g2_1'), + DeConvBNLayer(out_channels[2], out_channels[3], 4, 2, act=None, name='fpn_up_g2_2') + ) + self.g3_conv = nn.Sequential( + ConvBNLayer(out_channels[3], out_channels[3], 3, 1, act='relu', name='fpn_up_g3_1'), + DeConvBNLayer(out_channels[3], out_channels[4], 4, 2, act=None, name='fpn_up_g3_2') + ) + + self.g4_conv = nn.Sequential( + ConvBNLayer(out_channels[4], out_channels[4], 3, 1, act='relu', name='fpn_up_fusion_1'), + ConvBNLayer(out_channels[4], out_channels[4], 1, 1, act=None, name='fpn_up_fusion_2') + ) + + def _add_relu(self, x1, x2): + x = paddle.add(x=x1, y=x2) + x = F.relu(x) + return x + + def forward(self, x): + f = x[2:][::-1] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + h3 = self.h3_conv(f[3]) + h4 = self.h4_conv(f[4]) + + g0 = self.g0_conv(h0) + g1 = self._add_relu(g0, h1) + g1 = self.g1_conv(g1) + g2 = self.g2_conv(self._add_relu(g1, h2)) + g3 = self.g3_conv(self._add_relu(g2, h3)) + g4 = self.g4_conv(self._add_relu(g3, h4)) + + return g4 + + +class FPN_Down_Fusion(nn.Layer): + def __init__(self, in_channels): + super(FPN_Down_Fusion, self).__init__() + out_channels = [32, 64, 128] + + self.h0_conv = ConvBNLayer(in_channels[0], out_channels[0], 3, 1, act=None, name='fpn_down_h0') + self.h1_conv = ConvBNLayer(in_channels[1], out_channels[1], 3, 1, act=None, name='fpn_down_h1') + self.h2_conv = ConvBNLayer(in_channels[2], out_channels[2], 3, 1, act=None, name='fpn_down_h2') + + self.g0_conv = ConvBNLayer(out_channels[0], out_channels[1], 3, 2, act=None, name='fpn_down_g0') + + self.g1_conv = nn.Sequential( + ConvBNLayer(out_channels[1], out_channels[1], 3, 1, act='relu', name='fpn_down_g1_1'), + ConvBNLayer(out_channels[1], out_channels[2], 3, 2, act=None, name='fpn_down_g1_2') + ) + + self.g2_conv = nn.Sequential( + ConvBNLayer(out_channels[2], out_channels[2], 3, 1, act='relu', name='fpn_down_fusion_1'), + ConvBNLayer(out_channels[2], out_channels[2], 1, 1, act=None, name='fpn_down_fusion_2') + ) + + def forward(self, x): + f = x[:3] + h0 = self.h0_conv(f[0]) + h1 = self.h1_conv(f[1]) + h2 = self.h2_conv(f[2]) + g0 = self.g0_conv(h0) + g1 = paddle.add(x=g0, y=h1) + g1 = F.relu(g1) + g1 = self.g1_conv(g1) + g2 = paddle.add(x=g1, y=h2) + g2 = F.relu(g2) + g2 = self.g2_conv(g2) + return g2 + + +class Cross_Attention(nn.Layer): + def __init__(self, in_channels): + super(Cross_Attention, self).__init__() + self.theta_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_theta') + self.phi_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_phi') + self.g_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act='relu', name='f_g') + + self.fh_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_weight') + self.fh_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fh_sc') + + self.fv_weight_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_weight') + self.fv_sc_conv = ConvBNLayer(in_channels, in_channels, 1, 1, act=None, name='fv_sc') + + self.f_attn_conv = ConvBNLayer(in_channels * 2, in_channels, 1, 1, act='relu', name='f_attn') + + def 
_cal_fweight(self, f, shape): + f_theta, f_phi, f_g = f + #flatten + f_theta = paddle.transpose(f_theta, [0, 2, 3, 1]) + f_theta = paddle.reshape(f_theta, [shape[0] * shape[1], shape[2], 128]) + f_phi = paddle.transpose(f_phi, [0, 2, 3, 1]) + f_phi = paddle.reshape(f_phi, [shape[0] * shape[1], shape[2], 128]) + f_g = paddle.transpose(f_g, [0, 2, 3, 1]) + f_g = paddle.reshape(f_g, [shape[0] * shape[1], shape[2], 128]) + #correlation + f_attn = paddle.matmul(f_theta, paddle.transpose(f_phi, [0, 2, 1])) + #scale + f_attn = f_attn / (128**0.5) + f_attn = F.softmax(f_attn) + #weighted sum + f_weight = paddle.matmul(f_attn, f_g) + f_weight = paddle.reshape( + f_weight, [shape[0], shape[1], shape[2], 128]) + return f_weight + + def forward(self, f_common): + f_shape = paddle.shape(f_common) + # print('f_shape: ', f_shape) + + f_theta = self.theta_conv(f_common) + f_phi = self.phi_conv(f_common) + f_g = self.g_conv(f_common) + + ######## horizon ######## + fh_weight = self._cal_fweight([f_theta, f_phi, f_g], + [f_shape[0], f_shape[2], f_shape[3]]) + fh_weight = paddle.transpose(fh_weight, [0, 3, 1, 2]) + fh_weight = self.fh_weight_conv(fh_weight) + #short cut + fh_sc = self.fh_sc_conv(f_common) + f_h = F.relu(fh_weight + fh_sc) + + ######## vertical ######## + fv_theta = paddle.transpose(f_theta, [0, 1, 3, 2]) + fv_phi = paddle.transpose(f_phi, [0, 1, 3, 2]) + fv_g = paddle.transpose(f_g, [0, 1, 3, 2]) + fv_weight = self._cal_fweight([fv_theta, fv_phi, fv_g], + [f_shape[0], f_shape[3], f_shape[2]]) + fv_weight = paddle.transpose(fv_weight, [0, 3, 2, 1]) + fv_weight = self.fv_weight_conv(fv_weight) + #short cut + fv_sc = self.fv_sc_conv(f_common) + f_v = F.relu(fv_weight + fv_sc) + + ######## merge ######## + f_attn = paddle.concat([f_h, f_v], axis=1) + f_attn = self.f_attn_conv(f_attn) + return f_attn + + +class SASTFPN(nn.Layer): + def __init__(self, in_channels, with_cab=False, **kwargs): + super(SASTFPN, self).__init__() + self.in_channels = in_channels + self.with_cab = with_cab + self.FPN_Down_Fusion = FPN_Down_Fusion(self.in_channels) + self.FPN_Up_Fusion = FPN_Up_Fusion(self.in_channels) + self.out_channels = 128 + self.cross_attention = Cross_Attention(self.out_channels) + + def forward(self, x): + #down fpn + f_down = self.FPN_Down_Fusion(x) + + #up fpn + f_up = self.FPN_Up_Fusion(x) + + #fusion + f_common = paddle.add(x=f_down, y=f_up) + f_common = F.relu(f_common) + + if self.with_cab: + # print('enhence f_common with CAB.') + f_common = self.cross_attention(f_common) + + return f_common diff --git a/ppocr/modeling/necks/table_fpn.py b/ppocr/modeling/necks/table_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..734f15af65e4e15a7ddb4004954a61bfa1934246 --- /dev/null +++ b/ppocr/modeling/necks/table_fpn.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
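# A minimal, self-contained sketch of the scaled dot-product attention that
# Cross_Attention._cal_fweight (sast_fpn.py above) applies along one spatial
# axis: the features of a row attend to each other, scaled by the square root
# of the channel dimension. Shapes are toy values chosen for illustration;
# only the channel count (128) matches the module above.
import paddle
import paddle.nn.functional as F

B, H, W, C = 2, 8, 16, 128                      # batch, height, width, channels
f_theta = paddle.rand([B * H, W, C])            # queries: one map row per sequence
f_phi = paddle.rand([B * H, W, C])              # keys
f_g = paddle.rand([B * H, W, C])                # values

attn = paddle.matmul(f_theta, f_phi.transpose([0, 2, 1])) / (C ** 0.5)  # (B*H) x W x W
attn = F.softmax(attn, axis=-1)                 # attention over positions in the row
f_weight = paddle.matmul(attn, f_g)             # (B*H) x W x C, weighted sum of values
print(f_weight.shape)                           # [16, 16, 128]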
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + + +class TableFPN(nn.Layer): + def __init__(self, in_channels, out_channels, **kwargs): + super(TableFPN, self).__init__() + self.out_channels = 512 + weight_attr = paddle.nn.initializer.KaimingUniform() + self.in2_conv = nn.Conv2D( + in_channels=in_channels[0], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in3_conv = nn.Conv2D( + in_channels=in_channels[1], + out_channels=self.out_channels, + kernel_size=1, + stride = 1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in4_conv = nn.Conv2D( + in_channels=in_channels[2], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.in5_conv = nn.Conv2D( + in_channels=in_channels[3], + out_channels=self.out_channels, + kernel_size=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p5_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p4_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p3_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.p2_conv = nn.Conv2D( + in_channels=self.out_channels, + out_channels=self.out_channels // 4, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), + bias_attr=False) + self.fuse_conv = nn.Conv2D( + in_channels=self.out_channels * 4, + out_channels=512, + kernel_size=3, + padding=1, + weight_attr=ParamAttr(initializer=weight_attr), bias_attr=False) + + def forward(self, x): + c2, c3, c4, c5 = x + + in5 = self.in5_conv(c5) + in4 = self.in4_conv(c4) + in3 = self.in3_conv(c3) + in2 = self.in2_conv(c2) + + out4 = in4 + F.upsample( + in5, size=in4.shape[2:4], mode="nearest", align_mode=1) # 1/16 + out3 = in3 + F.upsample( + out4, size=in3.shape[2:4], mode="nearest", align_mode=1) # 1/8 + out2 = in2 + F.upsample( + out3, size=in2.shape[2:4], mode="nearest", align_mode=1) # 1/4 + + p4 = F.upsample(out4, size=in5.shape[2:4], mode="nearest", align_mode=1) + p3 = F.upsample(out3, size=in5.shape[2:4], mode="nearest", align_mode=1) + p2 = F.upsample(out2, size=in5.shape[2:4], mode="nearest", align_mode=1) + fuse = paddle.concat([in5, p4, p3, p2], axis=1) + fuse_conv = self.fuse_conv(fuse) * 0.005 + return [c5 + fuse_conv] diff --git a/ppocr/modeling/transforms/__init__.py b/ppocr/modeling/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..022ece60a56131a25049547a64bdaf9f94c0e69c --- /dev/null +++ b/ppocr/modeling/transforms/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['build_transform'] + + +def build_transform(config): + from .tps import TPS + from .stn import STN_ON + from .tsrn import TSRN + from .tbsrn import TBSRN + from .gaspin_transformer import GA_SPIN_Transformer as GA_SPIN + + support_dict = ['TPS', 'STN_ON', 'GA_SPIN', 'TSRN', 'TBSRN'] + + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'transform only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/modeling/transforms/gaspin_transformer.py b/ppocr/modeling/transforms/gaspin_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4719eb2162a02141620586bcb6a849ae16f3b62 --- /dev/null +++ b/ppocr/modeling/transforms/gaspin_transformer.py @@ -0,0 +1,284 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +import functools +from .tps import GridGenerator + +'''This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR/davarocr/davar_rcg/models/transformations/gaspin_transformation.py +''' + +class SP_TransformerNetwork(nn.Layer): + """ + Sturture-Preserving Transformation (SPT) as Equa. (2) in Ref. [1] + Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021. 
+ """ + + def __init__(self, nc=1, default_type=5): + """ Based on SPIN + Args: + nc (int): number of input channels (usually in 1 or 3) + default_type (int): the complexity of transformation intensities (by default set to 6 as the paper) + """ + super(SP_TransformerNetwork, self).__init__() + self.power_list = self.cal_K(default_type) + self.sigmoid = nn.Sigmoid() + self.bn = nn.InstanceNorm2D(nc) + + def cal_K(self, k=5): + """ + + Args: + k (int): the complexity of transformation intensities (by default set to 6 as the paper) + + Returns: + List: the normalized intensity of each pixel in [0,1], denoted as \beta [1x(2K+1)] + + """ + from math import log + x = [] + if k != 0: + for i in range(1, k+1): + lower = round(log(1-(0.5/(k+1))*i)/log((0.5/(k+1))*i), 2) + upper = round(1/lower, 2) + x.append(lower) + x.append(upper) + x.append(1.00) + return x + + def forward(self, batch_I, weights, offsets, lambda_color=None): + """ + + Args: + batch_I (Tensor): batch of input images [batch_size x nc x I_height x I_width] + weights: + offsets: the predicted offset by AIN, a scalar + lambda_color: the learnable update gate \alpha in Equa. (5) as + g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets} + + Returns: + Tensor: transformed images by SPN as Equa. (4) in Ref. [1] + [batch_size x I_channel_num x I_r_height x I_r_width] + + """ + batch_I = (batch_I + 1) * 0.5 + if offsets is not None: + batch_I = batch_I*(1-lambda_color) + offsets*lambda_color + batch_weight_params = paddle.unsqueeze(paddle.unsqueeze(weights, -1), -1) + batch_I_power = paddle.stack([batch_I.pow(p) for p in self.power_list], axis=1) + + batch_weight_sum = paddle.sum(batch_I_power * batch_weight_params, axis=1) + batch_weight_sum = self.bn(batch_weight_sum) + batch_weight_sum = self.sigmoid(batch_weight_sum) + batch_weight_sum = batch_weight_sum * 2 - 1 + return batch_weight_sum + +class GA_SPIN_Transformer(nn.Layer): + """ + Geometric-Absorbed SPIN Transformation (GA-SPIN) proposed in Ref. [1] + + + Ref: [1] SPIN: Structure-Preserving Inner Offset Network for Scene Text Recognition. AAAI-2021. + """ + + def __init__(self, in_channels=1, + I_r_size=(32, 100), + offsets=False, + norm_type='BN', + default_type=6, + loc_lr=1, + stn=True): + """ + Args: + in_channels (int): channel of input features, + set it to 1 if the grayscale images and 3 if RGB input + I_r_size (tuple): size of rectified images (used in STN transformations) + offsets (bool): set it to False if use SPN w.o. AIN, + and set it to True if use SPIN (both with SPN and AIN) + norm_type (str): the normalization type of the module, + set it to 'BN' by default, 'IN' optionally + default_type (int): the K chromatic space, + set it to 3/5/6 depend on the complexity of transformation intensities + loc_lr (float): learning rate of location network + stn (bool): whther to use stn. 
+ + """ + super(GA_SPIN_Transformer, self).__init__() + self.nc = in_channels + self.spt = True + self.offsets = offsets + self.stn = stn # set to True in GA-SPIN, while set it to False in SPIN + self.I_r_size = I_r_size + self.out_channels = in_channels + if norm_type == 'BN': + norm_layer = functools.partial(nn.BatchNorm2D, use_global_stats=True) + elif norm_type == 'IN': + norm_layer = functools.partial(nn.InstanceNorm2D, weight_attr=False, + use_global_stats=False) + else: + raise NotImplementedError('normalization layer [%s] is not found' % norm_type) + + if self.spt: + self.sp_net = SP_TransformerNetwork(in_channels, + default_type) + self.spt_convnet = nn.Sequential( + # 32*100 + nn.Conv2D(in_channels, 32, 3, 1, 1, bias_attr=False), + norm_layer(32), nn.ReLU(), + nn.MaxPool2D(kernel_size=2, stride=2), + # 16*50 + nn.Conv2D(32, 64, 3, 1, 1, bias_attr=False), + norm_layer(64), nn.ReLU(), + nn.MaxPool2D(kernel_size=2, stride=2), + # 8*25 + nn.Conv2D(64, 128, 3, 1, 1, bias_attr=False), + norm_layer(128), nn.ReLU(), + nn.MaxPool2D(kernel_size=2, stride=2), + # 4*12 + ) + self.stucture_fc1 = nn.Sequential( + nn.Conv2D(128, 256, 3, 1, 1, bias_attr=False), + norm_layer(256), nn.ReLU(), + nn.MaxPool2D(kernel_size=2, stride=2), + nn.Conv2D(256, 256, 3, 1, 1, bias_attr=False), + norm_layer(256), nn.ReLU(), # 2*6 + nn.MaxPool2D(kernel_size=2, stride=2), + nn.Conv2D(256, 512, 3, 1, 1, bias_attr=False), + norm_layer(512), nn.ReLU(), # 1*3 + nn.AdaptiveAvgPool2D(1), + nn.Flatten(1, -1), # batch_size x 512 + nn.Linear(512, 256, weight_attr=nn.initializer.Normal(0.001)), + nn.BatchNorm1D(256), nn.ReLU() + ) + self.out_weight = 2*default_type+1 + self.spt_length = 2*default_type+1 + if offsets: + self.out_weight += 1 + if self.stn: + self.F = 20 + self.out_weight += self.F * 2 + self.GridGenerator = GridGenerator(self.F*2, self.F) + + # self.out_weight*=nc + # Init structure_fc2 in LocalizationNetwork + initial_bias = self.init_spin(default_type*2) + initial_bias = initial_bias.reshape(-1) + param_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(np.zeros([256, self.out_weight]))) + bias_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(initial_bias)) + self.stucture_fc2 = nn.Linear(256, self.out_weight, + weight_attr=param_attr, + bias_attr=bias_attr) + self.sigmoid = nn.Sigmoid() + + if offsets: + self.offset_fc1 = nn.Sequential(nn.Conv2D(128, 16, + 3, 1, 1, + bias_attr=False), + norm_layer(16), + nn.ReLU(),) + self.offset_fc2 = nn.Conv2D(16, in_channels, + 3, 1, 1) + self.pool = nn.MaxPool2D(2, 2) + + def init_spin(self, nz): + """ + Args: + nz (int): number of paired \betas exponents, which means the value of K x 2 + + """ + init_id = [0.00]*nz+[5.00] + if self.offsets: + init_id += [-5.00] + # init_id *=3 + init = np.array(init_id) + + if self.stn: + F = self.F + ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2)) + ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2)) + ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2)) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0) + initial_bias = initial_bias.reshape(-1) + init = np.concatenate([init, initial_bias], axis=0) + return init + + def forward(self, x, return_weight=False): + """ + Args: + x (Tensor): input image batch + return_weight (bool): set to False by default, + if set to True return the predicted offsets of AIN, denoted as 
x_{offsets} + + Returns: + Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size + """ + + if self.spt: + feat = self.spt_convnet(x) + fc1 = self.stucture_fc1(feat) + sp_weight_fusion = self.stucture_fc2(fc1) + sp_weight_fusion = sp_weight_fusion.reshape([x.shape[0], self.out_weight, 1]) + if self.offsets: # SPIN w. AIN + lambda_color = sp_weight_fusion[:, self.spt_length, 0] + lambda_color = self.sigmoid(lambda_color).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) + sp_weight = sp_weight_fusion[:, :self.spt_length, :] + offsets = self.pool(self.offset_fc2(self.offset_fc1(feat))) + + assert offsets.shape[2] == 2 # 2 + assert offsets.shape[3] == 6 # 16 + offsets = self.sigmoid(offsets) # v12 + + if return_weight: + return offsets + offsets = nn.functional.upsample(offsets, size=(x.shape[2], x.shape[3]), mode='bilinear') + + if self.stn: + batch_C_prime = sp_weight_fusion[:, (self.spt_length + 1):, :].reshape([x.shape[0], self.F, 2]) + build_P_prime = self.GridGenerator(batch_C_prime, self.I_r_size) + build_P_prime_reshape = build_P_prime.reshape([build_P_prime.shape[0], + self.I_r_size[0], + self.I_r_size[1], + 2]) + + else: # SPIN w.o. AIN + sp_weight = sp_weight_fusion[:, :self.spt_length, :] + lambda_color, offsets = None, None + + if self.stn: + batch_C_prime = sp_weight_fusion[:, self.spt_length:, :].reshape([x.shape[0], self.F, 2]) + build_P_prime = self.GridGenerator(batch_C_prime, self.I_r_size) + build_P_prime_reshape = build_P_prime.reshape([build_P_prime.shape[0], + self.I_r_size[0], + self.I_r_size[1], + 2]) + + x = self.sp_net(x, sp_weight, offsets, lambda_color) + if self.stn: + x = F.grid_sample(x=x, grid=build_P_prime_reshape, padding_mode='border') + return x diff --git a/ppocr/modeling/transforms/stn.py b/ppocr/modeling/transforms/stn.py new file mode 100644 index 0000000000000000000000000000000000000000..6f2bdda050f217d8253740001901fbff4065782a --- /dev/null +++ b/ppocr/modeling/transforms/stn.py @@ -0,0 +1,135 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/stn_head.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np + +from .tps_spatial_transformer import TPSSpatialTransformer + + +def conv3x3_block(in_channels, out_channels, stride=1): + n = 3 * 3 * out_channels + w = math.sqrt(2. 
/ n) + conv_layer = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=nn.initializer.Normal( + mean=0.0, std=w), + bias_attr=nn.initializer.Constant(0)) + block = nn.Sequential(conv_layer, nn.BatchNorm2D(out_channels), nn.ReLU()) + return block + + +class STN(nn.Layer): + def __init__(self, in_channels, num_ctrlpoints, activation='none'): + super(STN, self).__init__() + self.in_channels = in_channels + self.num_ctrlpoints = num_ctrlpoints + self.activation = activation + self.stn_convnet = nn.Sequential( + conv3x3_block(in_channels, 32), #32x64 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(32, 64), #16x32 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(64, 128), # 8*16 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(128, 256), # 4*8 + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256), # 2*4, + nn.MaxPool2D( + kernel_size=2, stride=2), + conv3x3_block(256, 256)) # 1*2 + self.stn_fc1 = nn.Sequential( + nn.Linear( + 2 * 256, + 512, + weight_attr=nn.initializer.Normal(0, 0.001), + bias_attr=nn.initializer.Constant(0)), + nn.BatchNorm1D(512), + nn.ReLU()) + fc2_bias = self.init_stn() + self.stn_fc2 = nn.Linear( + 512, + num_ctrlpoints * 2, + weight_attr=nn.initializer.Constant(0.0), + bias_attr=nn.initializer.Assign(fc2_bias)) + + def init_stn(self): + margin = 0.01 + sampling_num_per_side = int(self.num_ctrlpoints / 2) + ctrl_pts_x = np.linspace(margin, 1. - margin, sampling_num_per_side) + ctrl_pts_y_top = np.ones(sampling_num_per_side) * margin + ctrl_pts_y_bottom = np.ones(sampling_num_per_side) * (1 - margin) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + ctrl_points = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0).astype(np.float32) + if self.activation == 'none': + pass + elif self.activation == 'sigmoid': + ctrl_points = -np.log(1. / ctrl_points - 1.) 
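        # pre-warp the bias through the logit (inverse sigmoid) so that, after
        # the sigmoid applied in forward(), the initial fc2 output still
        # reproduces the evenly spaced control points computed above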
+ ctrl_points = paddle.to_tensor(ctrl_points) + fc2_bias = paddle.reshape( + ctrl_points, shape=[ctrl_points.shape[0] * ctrl_points.shape[1]]) + return fc2_bias + + def forward(self, x): + x = self.stn_convnet(x) + batch_size, _, h, w = x.shape + x = paddle.reshape(x, shape=(batch_size, -1)) + img_feat = self.stn_fc1(x) + x = self.stn_fc2(0.1 * img_feat) + if self.activation == 'sigmoid': + x = F.sigmoid(x) + x = paddle.reshape(x, shape=[-1, self.num_ctrlpoints, 2]) + return img_feat, x + + +class STN_ON(nn.Layer): + def __init__(self, in_channels, tps_inputsize, tps_outputsize, + num_control_points, tps_margins, stn_activation): + super(STN_ON, self).__init__() + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + self.stn_head = STN(in_channels=in_channels, + num_ctrlpoints=num_control_points, + activation=stn_activation) + self.tps_inputsize = tps_inputsize + self.out_channels = in_channels + + def forward(self, image): + stn_input = paddle.nn.functional.interpolate( + image, self.tps_inputsize, mode="bilinear", align_corners=True) + stn_img_feat, ctrl_points = self.stn_head(stn_input) + x, _ = self.tps(image, ctrl_points) + return x diff --git a/ppocr/modeling/transforms/tbsrn.py b/ppocr/modeling/transforms/tbsrn.py new file mode 100644 index 0000000000000000000000000000000000000000..ee119003600b0515feb6fd1049e2c91565528b7d --- /dev/null +++ b/ppocr/modeling/transforms/tbsrn.py @@ -0,0 +1,264 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
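# A small NumPy sketch of the fiducial initialisation in STN.init_stn (stn.py
# above): control points are spread evenly along the top and bottom image
# borders with a small margin, and this array, flattened, seeds the fc2 bias.
# The count of 20 control points is assumed here purely for illustration.
import numpy as np

num_ctrlpoints, margin = 20, 0.01
side = num_ctrlpoints // 2
ctrl_pts_x = np.linspace(margin, 1.0 - margin, side)
top = np.stack([ctrl_pts_x, np.full(side, margin)], axis=1)            # y = margin
bottom = np.stack([ctrl_pts_x, np.full(side, 1.0 - margin)], axis=1)   # y = 1 - margin
ctrl_points = np.concatenate([top, bottom], axis=0).astype(np.float32)
print(ctrl_points.shape)   # (20, 2), flattened to a 40-dim fc2 bias in the module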
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/scene-text-telescope/model/tbsrn.py +""" + +import math +import warnings +import numpy as np +import paddle +from paddle import nn +import string + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STNHead +from .tsrn import GruBlock, mish, UpsampleBLock +from ppocr.modeling.heads.sr_rensnet_transformer import Transformer, LayerNorm, \ + PositionwiseFeedForward, MultiHeadedAttention + + +def positionalencoding2d(d_model, height, width): + """ + :param d_model: dimension of the model + :param height: height of the positions + :param width: width of the positions + :return: d_model*height*width position matrix + """ + if d_model % 4 != 0: + raise ValueError("Cannot use sin/cos positional encoding with " + "odd dimension (got dim={:d})".format(d_model)) + pe = paddle.zeros([d_model, height, width]) + # Each dimension use half of d_model + d_model = int(d_model / 2) + div_term = paddle.exp(paddle.arange(0., d_model, 2) * + -(math.log(10000.0) / d_model)) + pos_w = paddle.arange(0., width, dtype='float32').unsqueeze(1) + pos_h = paddle.arange(0., height, dtype='float32').unsqueeze(1) + + pe[0:d_model:2, :, :] = paddle.sin(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[1:d_model:2, :, :] = paddle.cos(pos_w * div_term).transpose([1, 0]).unsqueeze(1).tile([1, height, 1]) + pe[d_model::2, :, :] = paddle.sin(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) + pe[d_model + 1::2, :, :] = paddle.cos(pos_h * div_term).transpose([1, 0]).unsqueeze(2).tile([1, 1, width]) + + return pe + + +class FeatureEnhancer(nn.Layer): + + def __init__(self): + super(FeatureEnhancer, self).__init__() + + self.multihead = MultiHeadedAttention(h=4, d_model=128, dropout=0.1) + self.mul_layernorm1 = LayerNorm(features=128) + + self.pff = PositionwiseFeedForward(128, 128) + self.mul_layernorm3 = LayerNorm(features=128) + + self.linear = nn.Linear(128, 64) + + def forward(self, conv_feature): + ''' + text : (batch, seq_len, embedding_size) + global_info: (batch, embedding_size, 1, 1) + conv_feature: (batch, channel, H, W) + ''' + batch = conv_feature.shape[0] + position2d = positionalencoding2d(64, 16, 64).cast('float32').unsqueeze(0).reshape([1, 64, 1024]) + position2d = position2d.tile([batch, 1, 1]) + conv_feature = paddle.concat([conv_feature, position2d], 1) # batch, 128(64+64), 32, 128 + result = conv_feature.transpose([0, 2, 1]) + origin_result = result + result = self.mul_layernorm1(origin_result + self.multihead(result, result, result, mask=None)[0]) + origin_result = result + result = self.mul_layernorm3(origin_result + self.pff(result)) + result = self.linear(result) + return result.transpose([0, 2, 1]) + + +def str_filt(str_, voc_type): + alpha_dict = { + 'digit': string.digits, + 'lower': string.digits + string.ascii_lowercase, + 'upper': string.digits + string.ascii_letters, + 'all': string.digits + string.ascii_letters + string.punctuation + } + if voc_type == 'lower': + str_ = str_.lower() + for char in str_: + if char not in alpha_dict[voc_type]: + str_ = str_.replace(char, '') + str_ = str_.lower() + return str_ + + +class TBSRN(nn.Layer): + def __init__(self, + in_channels=3, + scale_factor=2, + width=128, + height=32, + STN=True, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False): + super(TBSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 
+ upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2D(in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU() + # nn.ReLU() + ) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), RecurrentResidualBlock(2 * hidden_units)) + + setattr(self, 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2D(2 * hidden_units, 2 * hidden_units, kernel_size=3, padding=1), + nn.BatchNorm2D(2 * hidden_units) + )) + + # self.non_local = NonLocalBlock2D(64, 64) + block_ = [UpsampleBLock(2 * hidden_units, 2) for _ in range(upsample_block_num)] + block_.append(nn.Conv2D(2 * hidden_units, in_planes, kernel_size=9, padding=4)) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + self.out_channels = in_channels + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STNHead( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.infer_mode = infer_mode + + self.english_alphabet = '-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + self.english_dict = {} + for index in range(len(self.english_alphabet)): + self.english_dict[self.english_alphabet[index]] = index + transformer = Transformer(alphabet='-0123456789abcdefghijklmnopqrstuvwxyz') + self.transformer = transformer + for param in self.transformer.parameters(): + param.trainable = False + + def label_encoder(self, label): + batch = len(label) + + length = [len(i) for i in label] + length_tensor = paddle.to_tensor(length, dtype='int64') + + max_length = max(length) + input_tensor = np.zeros((batch, max_length)) + for i in range(batch): + for j in range(length[i] - 1): + input_tensor[i][j + 1] = self.english_dict[label[i][j]] + + text_gt = [] + for i in label: + for j in i: + text_gt.append(self.english_dict[j]) + text_gt = paddle.to_tensor(text_gt, dtype='int64') + + input_tensor = paddle.to_tensor(input_tensor, dtype='int64') + return length_tensor, input_tensor, text_gt + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + block[str(self.srb_nums + 2)])) + + sr_img = paddle.tanh(block[str(self.srb_nums + 3)]) + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + + # add transformer + label = [str_filt(i, 'lower') + '-' for i in x[2]] + length_tensor, input_tensor, text_gt = self.label_encoder(label) + hr_pred, word_attention_map_gt, hr_correct_list = self.transformer(hr_img, length_tensor, + input_tensor) + sr_pred, word_attention_map_pred, sr_correct_list = self.transformer(sr_img, length_tensor, + input_tensor) + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["text_gt"] = text_gt + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + 
output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Layer): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2D(channels) + self.gru1 = GruBlock(channels, channels) + # self.prelu = nn.ReLU() + self.prelu = mish() + self.conv2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2D(channels) + self.gru2 = GruBlock(channels, channels) + self.feature_enhancer = FeatureEnhancer() + + for p in self.parameters(): + if p.dim() > 1: + paddle.nn.initializer.XavierUniform(p) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + + size = residual.shape + residual = residual.reshape([size[0], size[1], -1]) + residual = self.feature_enhancer(residual) + residual = residual.reshape([size[0], size[1], size[2], size[3]]) + return x + residual \ No newline at end of file diff --git a/ppocr/modeling/transforms/tps.py b/ppocr/modeling/transforms/tps.py new file mode 100644 index 0000000000000000000000000000000000000000..9bdab0f85112b90d8da959dce4e258188a812052 --- /dev/null +++ b/ppocr/modeling/transforms/tps.py @@ -0,0 +1,308 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +This code is refer from: +https://github.com/clovaai/deep-text-recognition-benchmark/blob/master/modules/transformation.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = "bn_" + name + self.bn = nn.BatchNorm( + out_channels, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class LocalizationNetwork(nn.Layer): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(LocalizationNetwork, self).__init__() + self.F = num_fiducial + F = num_fiducial + if model_name == "large": + num_filters_list = [64, 128, 256, 512] + fc_dim = 256 + else: + num_filters_list = [16, 32, 64, 128] + fc_dim = 64 + + self.block_list = [] + for fno in range(0, len(num_filters_list)): + num_filters = num_filters_list[fno] + name = "loc_conv%d" % fno + conv = self.add_sublayer( + name, + ConvBNLayer( + in_channels=in_channels, + out_channels=num_filters, + kernel_size=3, + act='relu', + name=name)) + self.block_list.append(conv) + if fno == len(num_filters_list) - 1: + pool = nn.AdaptiveAvgPool2D(1) + else: + pool = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) + in_channels = num_filters + self.block_list.append(pool) + name = "loc_fc1" + stdv = 1.0 / math.sqrt(num_filters_list[-1] * 1.0) + self.fc1 = nn.Linear( + in_channels, + fc_dim, + weight_attr=ParamAttr( + learning_rate=loc_lr, + name=name + "_w", + initializer=nn.initializer.Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name=name + '.b_0'), + name=name) + + # Init fc2 in LocalizationNetwork + initial_bias = self.get_initial_fiducials() + initial_bias = initial_bias.reshape(-1) + name = "loc_fc2" + param_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(np.zeros([fc_dim, F * 2])), + name=name + "_w") + bias_attr = ParamAttr( + learning_rate=loc_lr, + initializer=nn.initializer.Assign(initial_bias), + name=name + "_b") + self.fc2 = nn.Linear( + fc_dim, + F * 2, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + self.out_channels = F * 2 + + def forward(self, x): + """ + Estimating parameters of geometric transformation + Args: + image: input + Return: + batch_C_prime: the matrix of the geometric transformation + """ + B = x.shape[0] + i = 0 + for block in self.block_list: + x = block(x) + x = x.squeeze(axis=2).squeeze(axis=2) + x = self.fc1(x) + + x = F.relu(x) + x = self.fc2(x) + x = x.reshape(shape=[-1, self.F, 2]) + return x + + def get_initial_fiducials(self): + """ see RARE paper Fig. 
6 (a) """ + F = self.F + ctrl_pts_x = np.linspace(-1.0, 1.0, int(F / 2)) + ctrl_pts_y_top = np.linspace(0.0, -1.0, num=int(F / 2)) + ctrl_pts_y_bottom = np.linspace(1.0, 0.0, num=int(F / 2)) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + initial_bias = np.concatenate([ctrl_pts_top, ctrl_pts_bottom], axis=0) + return initial_bias + + +class GridGenerator(nn.Layer): + def __init__(self, in_channels, num_fiducial): + super(GridGenerator, self).__init__() + self.eps = 1e-6 + self.F = num_fiducial + + name = "ex_fc" + initializer = nn.initializer.Constant(value=0.0) + param_attr = ParamAttr( + learning_rate=0.0, initializer=initializer, name=name + "_w") + bias_attr = ParamAttr( + learning_rate=0.0, initializer=initializer, name=name + "_b") + self.fc = nn.Linear( + in_channels, + 6, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + + def forward(self, batch_C_prime, I_r_size): + """ + Generate the grid for the grid_sampler. + Args: + batch_C_prime: the matrix of the geometric transformation + I_r_size: the shape of the input image + Return: + batch_P_prime: the grid for the grid_sampler + """ + C = self.build_C_paddle() + P = self.build_P_paddle(I_r_size) + + inv_delta_C_tensor = self.build_inv_delta_C_paddle(C).astype('float32') + P_hat_tensor = self.build_P_hat_paddle( + C, paddle.to_tensor(P)).astype('float32') + + inv_delta_C_tensor.stop_gradient = True + P_hat_tensor.stop_gradient = True + + batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime) + + batch_C_ex_part_tensor.stop_gradient = True + + batch_C_prime_with_zeros = paddle.concat( + [batch_C_prime, batch_C_ex_part_tensor], axis=1) + batch_T = paddle.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros) + batch_P_prime = paddle.matmul(P_hat_tensor, batch_T) + return batch_P_prime + + def build_C_paddle(self): + """ Return coordinates of fiducial points in I_r; C """ + F = self.F + ctrl_pts_x = paddle.linspace(-1.0, 1.0, int(F / 2), dtype='float64') + ctrl_pts_y_top = -1 * paddle.ones([int(F / 2)], dtype='float64') + ctrl_pts_y_bottom = paddle.ones([int(F / 2)], dtype='float64') + ctrl_pts_top = paddle.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = paddle.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + C = paddle.concat([ctrl_pts_top, ctrl_pts_bottom], axis=0) + return C # F x 2 + + def build_P_paddle(self, I_r_size): + I_r_height, I_r_width = I_r_size + I_r_grid_x = (paddle.arange( + -I_r_width, I_r_width, 2, dtype='float64') + 1.0 + ) / paddle.to_tensor(np.array([I_r_width])) + + I_r_grid_y = (paddle.arange( + -I_r_height, I_r_height, 2, dtype='float64') + 1.0 + ) / paddle.to_tensor(np.array([I_r_height])) + + # P: self.I_r_width x self.I_r_height x 2 + P = paddle.stack(paddle.meshgrid(I_r_grid_x, I_r_grid_y), axis=2) + P = paddle.transpose(P, perm=[1, 0, 2]) + # n (= self.I_r_width x self.I_r_height) x 2 + return P.reshape([-1, 2]) + + def build_inv_delta_C_paddle(self, C): + """ Return inv_delta_C which is needed to calculate T """ + F = self.F + hat_eye = paddle.eye(F, dtype='float64') # F x F + hat_C = paddle.norm( + C.reshape([1, F, 2]) - C.reshape([F, 1, 2]), axis=2) + hat_eye + hat_C = (hat_C**2) * paddle.log(hat_C) + delta_C = paddle.concat( # F+3 x F+3 + [ + paddle.concat( + [paddle.ones( + (F, 1), dtype='float64'), C, hat_C], axis=1), # F x F+3 + paddle.concat( + [ + paddle.zeros( + (2, 3), dtype='float64'), paddle.transpose( + C, perm=[1, 0]) + ], + axis=1), # 2 x F+3 + paddle.concat( + [ + 
paddle.zeros( + (1, 3), dtype='float64'), paddle.ones( + (1, F), dtype='float64') + ], + axis=1) # 1 x F+3 + ], + axis=0) + inv_delta_C = paddle.inverse(delta_C) + return inv_delta_C # F+3 x F+3 + + def build_P_hat_paddle(self, C, P): + F = self.F + eps = self.eps + n = P.shape[0] # n (= self.I_r_width x self.I_r_height) + # P_tile: n x 2 -> n x 1 x 2 -> n x F x 2 + P_tile = paddle.tile(paddle.unsqueeze(P, axis=1), (1, F, 1)) + C_tile = paddle.unsqueeze(C, axis=0) # 1 x F x 2 + P_diff = P_tile - C_tile # n x F x 2 + # rbf_norm: n x F + rbf_norm = paddle.norm(P_diff, p=2, axis=2, keepdim=False) + + # rbf: n x F + rbf = paddle.multiply( + paddle.square(rbf_norm), paddle.log(rbf_norm + eps)) + P_hat = paddle.concat( + [paddle.ones( + (n, 1), dtype='float64'), P, rbf], axis=1) + return P_hat # n x F+3 + + def get_expand_tensor(self, batch_C_prime): + B, H, C = batch_C_prime.shape + batch_C_prime = batch_C_prime.reshape([B, H * C]) + batch_C_ex_part_tensor = self.fc(batch_C_prime) + batch_C_ex_part_tensor = batch_C_ex_part_tensor.reshape([-1, 3, 2]) + return batch_C_ex_part_tensor + + +class TPS(nn.Layer): + def __init__(self, in_channels, num_fiducial, loc_lr, model_name): + super(TPS, self).__init__() + self.loc_net = LocalizationNetwork(in_channels, num_fiducial, loc_lr, + model_name) + self.grid_generator = GridGenerator(self.loc_net.out_channels, + num_fiducial) + self.out_channels = in_channels + + def forward(self, image): + image.stop_gradient = False + batch_C_prime = self.loc_net(image) + batch_P_prime = self.grid_generator(batch_C_prime, image.shape[2:]) + batch_P_prime = batch_P_prime.reshape( + [-1, image.shape[2], image.shape[3], 2]) + batch_I_r = F.grid_sample(x=image, grid=batch_P_prime) + return batch_I_r diff --git a/ppocr/modeling/transforms/tps_spatial_transformer.py b/ppocr/modeling/transforms/tps_spatial_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e7ec2c848f192d766722f824962a7f8d0fed41f9 --- /dev/null +++ b/ppocr/modeling/transforms/tps_spatial_transformer.py @@ -0,0 +1,156 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
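# Minimal NumPy sketch of the thin-plate-spline radial basis that GridGenerator
# (tps.py above) evaluates in build_inv_delta_C_paddle and build_P_hat_paddle:
# phi(r) = r^2 * log(r) between every sampling point and every fiducial point.
# The points below are toy values, not the real fiducials.
import numpy as np

C = np.array([[-1.0, -1.0], [0.0, -1.0], [1.0, -1.0],
              [-1.0,  1.0], [0.0,  1.0], [1.0,  1.0]])     # F = 6 toy fiducial points
P = np.random.uniform(-1, 1, size=(5, 2))                   # n = 5 toy sampling points
eps = 1e-6

r = np.linalg.norm(P[:, None, :] - C[None, :, :], axis=2)   # n x F pairwise distances
rbf = (r ** 2) * np.log(r + eps)                            # phi(r) = r^2 log r
P_hat = np.concatenate([np.ones((P.shape[0], 1)), P, rbf], axis=1)  # n x (F + 3)
print(P_hat.shape)                                          # (5, 9)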
+""" +This code is refer from: +https://github.com/ayumiymk/aster.pytorch/blob/master/lib/models/tps_spatial_transformer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import numpy as np +import itertools + + +def grid_sample(input, grid, canvas=None): + input.stop_gradient = False + output = F.grid_sample(input, grid) + if canvas is None: + return output + else: + input_mask = paddle.ones(shape=input.shape) + output_mask = F.grid_sample(input_mask, grid) + padded_output = output * output_mask + canvas * (1 - output_mask) + return padded_output + + +# phi(x1, x2) = r^2 * log(r), where r = ||x1 - x2||_2 +def compute_partial_repr(input_points, control_points): + N = input_points.shape[0] + M = control_points.shape[0] + pairwise_diff = paddle.reshape( + input_points, shape=[N, 1, 2]) - paddle.reshape( + control_points, shape=[1, M, 2]) + # original implementation, very slow + # pairwise_dist = torch.sum(pairwise_diff ** 2, dim = 2) # square of distance + pairwise_diff_square = pairwise_diff * pairwise_diff + pairwise_dist = pairwise_diff_square[:, :, 0] + pairwise_diff_square[:, :, + 1] + repr_matrix = 0.5 * pairwise_dist * paddle.log(pairwise_dist) + # fix numerical error for 0 * log(0), substitute all nan with 0 + mask = np.array(repr_matrix != repr_matrix) + repr_matrix[mask] = 0 + return repr_matrix + + +# output_ctrl_pts are specified, according to our task. +def build_output_control_points(num_control_points, margins): + margin_x, margin_y = margins + num_ctrl_pts_per_side = num_control_points // 2 + ctrl_pts_x = np.linspace(margin_x, 1.0 - margin_x, num_ctrl_pts_per_side) + ctrl_pts_y_top = np.ones(num_ctrl_pts_per_side) * margin_y + ctrl_pts_y_bottom = np.ones(num_ctrl_pts_per_side) * (1.0 - margin_y) + ctrl_pts_top = np.stack([ctrl_pts_x, ctrl_pts_y_top], axis=1) + ctrl_pts_bottom = np.stack([ctrl_pts_x, ctrl_pts_y_bottom], axis=1) + output_ctrl_pts_arr = np.concatenate( + [ctrl_pts_top, ctrl_pts_bottom], axis=0) + output_ctrl_pts = paddle.to_tensor(output_ctrl_pts_arr) + return output_ctrl_pts + + +class TPSSpatialTransformer(nn.Layer): + def __init__(self, + output_image_size=None, + num_control_points=None, + margins=None): + super(TPSSpatialTransformer, self).__init__() + self.output_image_size = output_image_size + self.num_control_points = num_control_points + self.margins = margins + + self.target_height, self.target_width = output_image_size + target_control_points = build_output_control_points(num_control_points, + margins) + N = num_control_points + + # create padded kernel matrix + forward_kernel = paddle.zeros(shape=[N + 3, N + 3]) + target_control_partial_repr = compute_partial_repr( + target_control_points, target_control_points) + target_control_partial_repr = paddle.cast(target_control_partial_repr, + forward_kernel.dtype) + forward_kernel[:N, :N] = target_control_partial_repr + forward_kernel[:N, -3] = 1 + forward_kernel[-3, :N] = 1 + target_control_points = paddle.cast(target_control_points, + forward_kernel.dtype) + forward_kernel[:N, -2:] = target_control_points + forward_kernel[-2:, :N] = paddle.transpose( + target_control_points, perm=[1, 0]) + # compute inverse matrix + inverse_kernel = paddle.inverse(forward_kernel) + + # create target cordinate matrix + HW = self.target_height * self.target_width + target_coordinate = list( + itertools.product( + range(self.target_height), 
range(self.target_width))) + target_coordinate = paddle.to_tensor(target_coordinate) # HW x 2 + Y, X = paddle.split( + target_coordinate, target_coordinate.shape[1], axis=1) + Y = Y / (self.target_height - 1) + X = X / (self.target_width - 1) + target_coordinate = paddle.concat( + [X, Y], axis=1) # convert from (y, x) to (x, y) + target_coordinate_partial_repr = compute_partial_repr( + target_coordinate, target_control_points) + target_coordinate_repr = paddle.concat( + [ + target_coordinate_partial_repr, paddle.ones(shape=[HW, 1]), + target_coordinate + ], + axis=1) + + # register precomputed matrices + self.inverse_kernel = inverse_kernel + self.padding_matrix = paddle.zeros(shape=[3, 2]) + self.target_coordinate_repr = target_coordinate_repr + self.target_control_points = target_control_points + + def forward(self, input, source_control_points): + assert source_control_points.ndimension() == 3 + assert source_control_points.shape[1] == self.num_control_points + assert source_control_points.shape[2] == 2 + batch_size = paddle.shape(source_control_points)[0] + + padding_matrix = paddle.expand( + self.padding_matrix, shape=[batch_size, 3, 2]) + Y = paddle.concat([source_control_points, padding_matrix], 1) + mapping_matrix = paddle.matmul(self.inverse_kernel, Y) + source_coordinate = paddle.matmul(self.target_coordinate_repr, + mapping_matrix) + + grid = paddle.reshape( + source_coordinate, + shape=[-1, self.target_height, self.target_width, 2]) + grid = paddle.clip(grid, 0, + 1) # the source_control_points may be out of [0, 1]. + # the input to grid_sample is normalized [-1, 1], but what we get is [0, 1] + grid = 2.0 * grid - 1.0 + output_maps = grid_sample(input, grid, canvas=None) + return output_maps, source_coordinate \ No newline at end of file diff --git a/ppocr/modeling/transforms/tsrn.py b/ppocr/modeling/transforms/tsrn.py new file mode 100644 index 0000000000000000000000000000000000000000..31aa90ea4b5d5e8f071487899b72219f3e5b36f5 --- /dev/null +++ b/ppocr/modeling/transforms/tsrn.py @@ -0,0 +1,219 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
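# Hedged usage sketch (shape check only) of TPSSpatialTransformer as defined
# above: 20 control points and 0.05 margins match the settings TBSRN uses
# above, while the 32 x 100 output size and the input image size are
# illustrative assumptions, not values prescribed by the module.
import paddle

tps = TPSSpatialTransformer(output_image_size=(32, 100),
                            num_control_points=20,
                            margins=(0.05, 0.05))
image = paddle.rand([2, 3, 64, 256])            # toy batch of source images
ctrl_points = paddle.rand([2, 20, 2])           # points in [0, 1], e.g. from the STN head
warped, _ = tps(image, ctrl_points)
print(warped.shape)                             # [2, 3, 32, 100]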
+""" +This code is refer from: +https://github.com/FudanVI/FudanOCR/blob/main/text-gestalt/model/tsrn.py +""" + +import math +import paddle +import paddle.nn.functional as F +from paddle import nn +from collections import OrderedDict +import sys +import numpy as np +import warnings +import math, copy +import cv2 + +warnings.filterwarnings("ignore") + +from .tps_spatial_transformer import TPSSpatialTransformer +from .stn import STN as STN_model +from ppocr.modeling.heads.sr_rensnet_transformer import Transformer + + +class TSRN(nn.Layer): + def __init__(self, + in_channels, + scale_factor=2, + width=128, + height=32, + STN=False, + srb_nums=5, + mask=False, + hidden_units=32, + infer_mode=False, + **kwargs): + super(TSRN, self).__init__() + in_planes = 3 + if mask: + in_planes = 4 + assert math.log(scale_factor, 2) % 1 == 0 + upsample_block_num = int(math.log(scale_factor, 2)) + self.block1 = nn.Sequential( + nn.Conv2D( + in_planes, 2 * hidden_units, kernel_size=9, padding=4), + nn.PReLU()) + self.srb_nums = srb_nums + for i in range(srb_nums): + setattr(self, 'block%d' % (i + 2), + RecurrentResidualBlock(2 * hidden_units)) + + setattr( + self, + 'block%d' % (srb_nums + 2), + nn.Sequential( + nn.Conv2D( + 2 * hidden_units, + 2 * hidden_units, + kernel_size=3, + padding=1), + nn.BatchNorm2D(2 * hidden_units))) + + block_ = [ + UpsampleBLock(2 * hidden_units, 2) + for _ in range(upsample_block_num) + ] + block_.append( + nn.Conv2D( + 2 * hidden_units, in_planes, kernel_size=9, padding=4)) + setattr(self, 'block%d' % (srb_nums + 3), nn.Sequential(*block_)) + self.tps_inputsize = [height // scale_factor, width // scale_factor] + tps_outputsize = [height // scale_factor, width // scale_factor] + num_control_points = 20 + tps_margins = [0.05, 0.05] + self.stn = STN + if self.stn: + self.tps = TPSSpatialTransformer( + output_image_size=tuple(tps_outputsize), + num_control_points=num_control_points, + margins=tuple(tps_margins)) + + self.stn_head = STN_model( + in_channels=in_planes, + num_ctrlpoints=num_control_points, + activation='none') + self.out_channels = in_channels + + self.r34_transformer = Transformer() + for param in self.r34_transformer.parameters(): + param.trainable = False + self.infer_mode = infer_mode + + def forward(self, x): + output = {} + if self.infer_mode: + output["lr_img"] = x + y = x + else: + output["lr_img"] = x[0] + output["hr_img"] = x[1] + y = x[0] + if self.stn and self.training: + _, ctrl_points_x = self.stn_head(y) + y, _ = self.tps(y, ctrl_points_x) + block = {'1': self.block1(y)} + for i in range(self.srb_nums + 1): + block[str(i + 2)] = getattr(self, + 'block%d' % (i + 2))(block[str(i + 1)]) + + block[str(self.srb_nums + 3)] = getattr(self, 'block%d' % (self.srb_nums + 3)) \ + ((block['1'] + block[str(self.srb_nums + 2)])) + + sr_img = paddle.tanh(block[str(self.srb_nums + 3)]) + + output["sr_img"] = sr_img + + if self.training: + hr_img = x[1] + length = x[2] + input_tensor = x[3] + + # add transformer + sr_pred, word_attention_map_pred, _ = self.r34_transformer( + sr_img, length, input_tensor) + + hr_pred, word_attention_map_gt, _ = self.r34_transformer( + hr_img, length, input_tensor) + + output["hr_img"] = hr_img + output["hr_pred"] = hr_pred + output["word_attention_map_gt"] = word_attention_map_gt + output["sr_pred"] = sr_pred + output["word_attention_map_pred"] = word_attention_map_pred + + return output + + +class RecurrentResidualBlock(nn.Layer): + def __init__(self, channels): + super(RecurrentResidualBlock, self).__init__() + self.conv1 = 
nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn1 = nn.BatchNorm2D(channels) + self.gru1 = GruBlock(channels, channels) + self.prelu = mish() + self.conv2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2D(channels) + self.gru2 = GruBlock(channels, channels) + + def forward(self, x): + residual = self.conv1(x) + residual = self.bn1(residual) + residual = self.prelu(residual) + residual = self.conv2(residual) + residual = self.bn2(residual) + residual = self.gru1(residual.transpose([0, 1, 3, 2])).transpose( + [0, 1, 3, 2]) + + return self.gru2(x + residual) + + +class UpsampleBLock(nn.Layer): + def __init__(self, in_channels, up_scale): + super(UpsampleBLock, self).__init__() + self.conv = nn.Conv2D( + in_channels, in_channels * up_scale**2, kernel_size=3, padding=1) + + self.pixel_shuffle = nn.PixelShuffle(up_scale) + self.prelu = mish() + + def forward(self, x): + x = self.conv(x) + x = self.pixel_shuffle(x) + x = self.prelu(x) + return x + + +class mish(nn.Layer): + def __init__(self, ): + super(mish, self).__init__() + self.activated = True + + def forward(self, x): + if self.activated: + x = x * (paddle.tanh(F.softplus(x))) + return x + + +class GruBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(GruBlock, self).__init__() + assert out_channels % 2 == 0 + self.conv1 = nn.Conv2D( + in_channels, out_channels, kernel_size=1, padding=0) + self.gru = nn.GRU(out_channels, + out_channels // 2, + direction='bidirectional') + + def forward(self, x): + # x: b, c, w, h + x = self.conv1(x) + x = x.transpose([0, 2, 3, 1]) # b, w, h, c + batch_size, w, h, c = x.shape + x = x.reshape([-1, h, c]) # b*w, h, c + x, _ = self.gru(x) + x = x.reshape([-1, w, h, c]) + x = x.transpose([0, 3, 1, 2]) + return x diff --git a/ppocr/optimizer/__init__.py b/ppocr/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b92954c9cc4e32bc23eb550d5ee4ecd45e9b9fc3 --- /dev/null +++ b/ppocr/optimizer/__init__.py @@ -0,0 +1,65 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import copy +import paddle + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + lr_name = lr_config.pop('name', 'Const') + lr = getattr(learning_rate, lr_name)(**lr_config)() + return lr + + +def build_optimizer(config, epochs, step_each_epoch, model): + from . 
import regularizer, optimizer + config = copy.deepcopy(config) + # step1 build lr + lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch) + + # step2 build regularization + if 'regularizer' in config and config['regularizer'] is not None: + reg_config = config.pop('regularizer') + reg_name = reg_config.pop('name') + if not hasattr(regularizer, reg_name): + reg_name += 'Decay' + reg = getattr(regularizer, reg_name)(**reg_config)() + elif 'weight_decay' in config: + reg = config.pop('weight_decay') + else: + reg = None + + # step3 build optimizer + optim_name = config.pop('name') + if 'clip_norm' in config: + clip_norm = config.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + elif 'clip_norm_global' in config: + clip_norm = config.pop('clip_norm_global') + grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim = getattr(optimizer, optim_name)(learning_rate=lr, + weight_decay=reg, + grad_clip=grad_clip, + **config) + return optim(model), lr diff --git a/ppocr/optimizer/learning_rate.py b/ppocr/optimizer/learning_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..be52a918458d64f0ae15b52ebf511e5068184f59 --- /dev/null +++ b/ppocr/optimizer/learning_rate.py @@ -0,0 +1,429 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from paddle.optimizer import lr +from .lr_scheduler import CyclicalCosineDecay, OneCycleDecay, TwoStepCosineDecay + + +class Linear(object): + """ + Linear learning rate decay + Args: + lr (float): The initial learning rate. It is a python float number. + epochs(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Linear, self).__init__() + self.learning_rate = learning_rate + self.epochs = epochs * step_each_epoch + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.epochs, + end_lr=self.end_lr, + power=self.power, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Cosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Cosine, self).__init__() + self.learning_rate = learning_rate + self.T_max = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Step(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_size, + step_each_epoch, + gamma, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Step, self).__init__() + self.step_size = step_each_epoch * step_size + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Piecewise(object): + """ + Piecewise learning rate decay + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + last_epoch (int, optional): The index of last epoch. 
Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + step_each_epoch, + decay_epochs, + values, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Piecewise, self).__init__() + self.boundaries = [step_each_epoch * e for e in decay_epochs] + self.values = values + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries, + values=self.values, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.values[0], + last_epoch=self.last_epoch) + return learning_rate + + +class CyclicalCosine(object): + """ + Cyclical cosine learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + cycle(int): period of the cosine learning rate + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + cycle, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(CyclicalCosine, self).__init__() + self.learning_rate = learning_rate + self.T_max = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + self.cycle = round(cycle * step_each_epoch) + + def __call__(self): + learning_rate = CyclicalCosineDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + cycle=self.cycle, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class OneCycle(object): + """ + One Cycle learning rate decay + Args: + max_lr(float): Upper learning rate boundaries + epochs(int): total training epochs + step_each_epoch(int): steps each epoch + anneal_strategy(str): {‘cos’, ‘linear’} Specifies the annealing strategy: “cos” for cosine annealing, “linear” for linear annealing. + Default: ‘cos’ + three_phase(bool): If True, use a third phase of the schedule to annihilate the learning rate according to ‘final_div_factor’ + instead of modifying the second phase (the first two phases will be symmetrical about the step indicated by ‘pct_start’). + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + max_lr, + epochs, + step_each_epoch, + anneal_strategy='cos', + three_phase=False, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(OneCycle, self).__init__() + self.max_lr = max_lr + self.epochs = epochs + self.steps_per_epoch = step_each_epoch + self.anneal_strategy = anneal_strategy + self.three_phase = three_phase + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = OneCycleDecay( + max_lr=self.max_lr, + epochs=self.epochs, + steps_per_epoch=self.steps_per_epoch, + anneal_strategy=self.anneal_strategy, + three_phase=self.three_phase, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.max_lr, + last_epoch=self.last_epoch) + return learning_rate + + +class Const(object): + """ + Const learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(Const, self).__init__() + self.learning_rate = learning_rate + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = self.learning_rate + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class DecayLearningRate(object): + """ + DecayLearningRate learning rate decay + new_lr = (lr - end_lr) * (1 - epoch/decay_steps)**power + end_lr + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + factor(float): Power of polynomial, should greater than 0.0 to get learning rate decay. Default: 0.9 + end_lr(float): The minimum final learning rate. Default: 0.0. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + factor=0.9, + end_lr=0, + **kwargs): + super(DecayLearningRate, self).__init__() + self.learning_rate = learning_rate + self.epochs = epochs + 1 + self.factor = factor + self.end_lr = 0 + self.decay_steps = step_each_epoch * epochs + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.decay_steps, + power=self.factor, + end_lr=self.end_lr) + return learning_rate + + +class MultiStepDecay(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + milestones, + step_each_epoch, + gamma, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(MultiStepDecay, self).__init__() + self.milestones = [step_each_epoch * e for e in milestones] + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = lr.MultiStepDecay( + learning_rate=self.learning_rate, + milestones=self.milestones, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class TwoStepCosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + warmup_epoch=0, + last_epoch=-1, + **kwargs): + super(TwoStepCosine, self).__init__() + self.learning_rate = learning_rate + self.T_max1 = step_each_epoch * 200 + self.T_max2 = step_each_epoch * epochs + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = TwoStepCosineDecay( + learning_rate=self.learning_rate, + T_max1=self.T_max1, + T_max2=self.T_max2, + last_epoch=self.last_epoch) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate diff --git a/ppocr/optimizer/lr_scheduler.py b/ppocr/optimizer/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..cd09367e2ab8a649e3c375698f5b182eb5c3ff7a --- /dev/null +++ b/ppocr/optimizer/lr_scheduler.py @@ -0,0 +1,222 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from paddle.optimizer.lr import LRScheduler + + +class CyclicalCosineDecay(LRScheduler): + def __init__(self, + learning_rate, + T_max, + cycle=1, + last_epoch=-1, + eta_min=0.0, + verbose=False): + """ + Cyclical cosine learning rate decay + A learning rate which can be referred in https://arxiv.org/pdf/2012.12645.pdf + Args: + learning rate(float): learning rate + T_max(int): maximum epoch num + cycle(int): period of the cosine decay + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ eta_min(float): minimum learning rate during training + verbose(bool): whether to print learning rate for each epoch + """ + super(CyclicalCosineDecay, self).__init__(learning_rate, last_epoch, + verbose) + self.cycle = cycle + self.eta_min = eta_min + + def get_lr(self): + if self.last_epoch == 0: + return self.base_lr + reletive_epoch = self.last_epoch % self.cycle + lr = self.eta_min + 0.5 * (self.base_lr - self.eta_min) * \ + (1 + math.cos(math.pi * reletive_epoch / self.cycle)) + return lr + + +class OneCycleDecay(LRScheduler): + """ + One Cycle learning rate decay + A learning rate which can be referred in https://arxiv.org/abs/1708.07120 + Code refered in https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + """ + + def __init__(self, + max_lr, + epochs=None, + steps_per_epoch=None, + pct_start=0.3, + anneal_strategy='cos', + div_factor=25., + final_div_factor=1e4, + three_phase=False, + last_epoch=-1, + verbose=False): + + # Validate total_steps + if epochs <= 0 or not isinstance(epochs, int): + raise ValueError( + "Expected positive integer epochs, but got {}".format(epochs)) + if steps_per_epoch <= 0 or not isinstance(steps_per_epoch, int): + raise ValueError( + "Expected positive integer steps_per_epoch, but got {}".format( + steps_per_epoch)) + self.total_steps = epochs * steps_per_epoch + + self.max_lr = max_lr + self.initial_lr = self.max_lr / div_factor + self.min_lr = self.initial_lr / final_div_factor + + if three_phase: + self._schedule_phases = [ + { + 'end_step': float(pct_start * self.total_steps) - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.max_lr, + }, + { + 'end_step': float(2 * pct_start * self.total_steps) - 2, + 'start_lr': self.max_lr, + 'end_lr': self.initial_lr, + }, + { + 'end_step': self.total_steps - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.min_lr, + }, + ] + else: + self._schedule_phases = [ + { + 'end_step': float(pct_start * self.total_steps) - 1, + 'start_lr': self.initial_lr, + 'end_lr': self.max_lr, + }, + { + 'end_step': self.total_steps - 1, + 'start_lr': self.max_lr, + 'end_lr': self.min_lr, + }, + ] + + # Validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError( + "Expected float between 0 and 1 pct_start, but got {}".format( + pct_start)) + + # Validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError( + "anneal_strategy must by one of 'cos' or 'linear', instead got {}". + format(anneal_strategy)) + elif anneal_strategy == 'cos': + self.anneal_func = self._annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = self._annealing_linear + + super(OneCycleDecay, self).__init__(max_lr, last_epoch, verbose) + + def _annealing_cos(self, start, end, pct): + "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0." + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + + def _annealing_linear(self, start, end, pct): + "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0." + return (end - start) * pct + start + + def get_lr(self): + computed_lr = 0.0 + step_num = self.last_epoch + + if step_num > self.total_steps: + raise ValueError( + "Tried to step {} times. 
The specified number of total steps is {}" + .format(step_num + 1, self.total_steps)) + start_step = 0 + for i, phase in enumerate(self._schedule_phases): + end_step = phase['end_step'] + if step_num <= end_step or i == len(self._schedule_phases) - 1: + pct = (step_num - start_step) / (end_step - start_step) + computed_lr = self.anneal_func(phase['start_lr'], + phase['end_lr'], pct) + break + start_step = phase['end_step'] + + return computed_lr + + +class TwoStepCosineDecay(LRScheduler): + def __init__(self, + learning_rate, + T_max1, + T_max2, + eta_min=0, + last_epoch=-1, + verbose=False): + if not isinstance(T_max1, int): + raise TypeError( + "The type of 'T_max1' in 'CosineAnnealingDecay' must be 'int', but received %s." + % type(T_max1)) + if not isinstance(T_max2, int): + raise TypeError( + "The type of 'T_max2' in 'CosineAnnealingDecay' must be 'int', but received %s." + % type(T_max2)) + if not isinstance(eta_min, (float, int)): + raise TypeError( + "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." + % type(eta_min)) + assert T_max1 > 0 and isinstance( + T_max1, int), " 'T_max1' must be a positive integer." + assert T_max2 > 0 and isinstance( + T_max2, int), " 'T_max1' must be a positive integer." + self.T_max1 = T_max1 + self.T_max2 = T_max2 + self.eta_min = float(eta_min) + super(TwoStepCosineDecay, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + + if self.last_epoch <= self.T_max1: + if self.last_epoch == 0: + return self.base_lr + elif (self.last_epoch - 1 - self.T_max1) % (2 * self.T_max1) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * ( + 1 - math.cos(math.pi / self.T_max1)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max1)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max1)) * ( + self.last_lr - self.eta_min) + self.eta_min + else: + if (self.last_epoch - 1 - self.T_max2) % (2 * self.T_max2) == 0: + return self.last_lr + (self.base_lr - self.eta_min) * ( + 1 - math.cos(math.pi / self.T_max2)) / 2 + + return (1 + math.cos(math.pi * self.last_epoch / self.T_max2)) / ( + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max2)) * ( + self.last_lr - self.eta_min) + self.eta_min + + def _get_closed_form_lr(self): + if self.last_epoch <= self.T_max1: + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( + math.pi * self.last_epoch / self.T_max1)) / 2 + else: + return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( + math.pi * self.last_epoch / self.T_max2)) / 2 diff --git a/ppocr/optimizer/optimizer.py b/ppocr/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..144f011c79ec2303b7fbc73ac078afe3ce92c255 --- /dev/null +++ b/ppocr/optimizer/optimizer.py @@ -0,0 +1,285 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
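# Editor's note: the classes in this file are small factories. build_optimizer() in
# ppocr/optimizer/__init__.py pops the 'lr', 'regularizer' and 'name' entries from the
# config, builds the scheduler and regularizer, then instantiates the matching class
# here and calls it with the model to obtain a paddle optimizer. A hedged sketch with
# illustrative config values (not taken from any shipped yml; 'model' is assumed to be
# a paddle.nn.Layer):
#
#     from ppocr.optimizer import build_optimizer
#
#     config = {
#         'name': 'Adam',
#         'beta1': 0.9,
#         'beta2': 0.999,
#         'clip_norm': 10.0,                                                  # -> ClipGradByNorm
#         'lr': {'name': 'Cosine', 'learning_rate': 0.001, 'warmup_epoch': 2},
#         'regularizer': {'name': 'L2', 'factor': 3.0e-05},                   # resolved to L2Decay
#     }
#     optimizer, lr_scheduler = build_optimizer(
#         config, epochs=100, step_each_epoch=500, model=model)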
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from paddle import optimizer as optim + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + **args): + super(Momentum, self).__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=train_params) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + **kwargs): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.group_lr = kwargs.get('group_lr', False) + self.training_step = kwargs.get('training_step', None) + + def __call__(self, model): + if self.group_lr: + if self.training_step == 'LF_2': + import paddle + if isinstance(model, paddle.fluid.dygraph.parallel. + DataParallel): # multi gpu + mlm = model._layers.head.MLM_VRM.MLM.parameters() + pre_mlm_pp = model._layers.head.MLM_VRM.Prediction.pp_share.parameters( + ) + pre_mlm_w = model._layers.head.MLM_VRM.Prediction.w_share.parameters( + ) + else: # single gpu + mlm = model.head.MLM_VRM.MLM.parameters() + pre_mlm_pp = model.head.MLM_VRM.Prediction.pp_share.parameters( + ) + pre_mlm_w = model.head.MLM_VRM.Prediction.w_share.parameters( + ) + + total = [] + for param in mlm: + total.append(id(param)) + for param in pre_mlm_pp: + total.append(id(param)) + for param in pre_mlm_w: + total.append(id(param)) + + group_base_params = [ + param for param in model.parameters() if id(param) in total + ] + group_small_params = [ + param for param in model.parameters() + if id(param) not in total + ] + train_params = [{ + 'params': group_base_params + }, { + 'params': group_small_params, + 'learning_rate': self.learning_rate.values[0] * 0.1 + }] + + else: + print( + 'group lr currently only support VisionLAN in LF_2 training step' + ) + train_params = [ + param for param in model.parameters() + if param.trainable is True + ] + else: + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + parameters=train_params) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. 
+ Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + **args): + super(RMSProp, self).__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=train_params) + return opt + + +class Adadelta(object): + def __init__(self, + learning_rate=0.001, + epsilon=1e-08, + rho=0.95, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + **kwargs): + self.learning_rate = learning_rate + self.epsilon = epsilon + self.rho = rho + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + + def __call__(self, model): + train_params = [ + param for param in model.parameters() if param.trainable is True + ] + opt = optim.Adadelta( + learning_rate=self.learning_rate, + epsilon=self.epsilon, + rho=self.rho, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + parameters=train_params) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=0.01, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + name=None, + lazy_mode=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = 0.01 if weight_decay is None else weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model): + parameters = [ + param for param in model.parameters() if param.trainable is True + ] + + self.no_weight_decay_param_name_list = [ + p.name for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name for n, p in model.named_parameters() if len(p.shape) == 1 + ] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list diff --git 
a/ppocr/optimizer/regularizer.py b/ppocr/optimizer/regularizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce68f7139e21f9e3e1dcc155254b7a92b0e7270 --- /dev/null +++ b/ppocr/optimizer/regularizer.py @@ -0,0 +1,51 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import paddle + + +class L1Decay(object): + """ + L1 Weight Decay Regularization, which encourages the weights to be sparse. + Args: + factor(float): regularization coeff. Default:0.0. + """ + + def __init__(self, factor=0.0): + super(L1Decay, self).__init__() + self.coeff = factor + + def __call__(self): + reg = paddle.regularizer.L1Decay(self.coeff) + return reg + + +class L2Decay(object): + """ + L2 Weight Decay Regularization, which helps to prevent the model over-fitting. + Args: + factor(float): regularization coeff. Default:0.0. + """ + + def __init__(self, factor=0.0): + super(L2Decay, self).__init__() + self.coeff = float(factor) + + def __call__(self): + return self.coeff \ No newline at end of file diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..36a3152f2f2d68ed0884bd415844d209d850f5ca --- /dev/null +++ b/ppocr/postprocess/__init__.py @@ -0,0 +1,71 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
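# Editor's note: build_post_process() below is a registry keyed by the 'name' field of
# the config; the remaining keys become constructor arguments of the selected class.
# A hedged sketch for the DB text detector (the config values and input shapes are
# illustrative assumptions):
#
#     from ppocr.postprocess import build_post_process
#
#     post_process = build_post_process({
#         'name': 'DBPostProcess',
#         'thresh': 0.3,
#         'box_thresh': 0.6,
#         'max_candidates': 1000,
#         'unclip_ratio': 1.5,
#         'box_type': 'quad',
#     })
#     # outs_dict['maps']: N x 1 x H x W probability map from the DB head;
#     # shape_list: one [src_h, src_w, ratio_h, ratio_w] row per image
#     boxes_batch = post_process(outs_dict, shape_list)
#     boxes = boxes_batch[0]['points']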
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy + +__all__ = ['build_post_process'] + +from .db_postprocess import DBPostProcess, DistillationDBPostProcess +from .east_postprocess import EASTPostProcess +from .sast_postprocess import SASTPostProcess +from .fce_postprocess import FCEPostProcess +from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, \ + DistillationCTCLabelDecode, NRTRLabelDecode, SARLabelDecode, \ + SEEDLabelDecode, PRENLabelDecode, ViTSTRLabelDecode, ABINetLabelDecode, \ + SPINLabelDecode, VLLabelDecode, RFLLabelDecode +from .cls_postprocess import ClsPostProcess +from .pg_postprocess import PGPostProcess +from .vqa_token_ser_layoutlm_postprocess import VQASerTokenLayoutLMPostProcess, DistillationSerPostProcess +from .vqa_token_re_layoutlm_postprocess import VQAReTokenLayoutLMPostProcess, DistillationRePostProcess +from .table_postprocess import TableMasterLabelDecode, TableLabelDecode +from .picodet_postprocess import PicoDetPostProcess +from .ct_postprocess import CTPostProcess +from .drrg_postprocess import DRRGPostprocess +from .rec_postprocess import CANLabelDecode + + +def build_post_process(config, global_config=None): + support_dict = [ + 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'FCEPostProcess', + 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode', + 'PGPostProcess', 'DistillationCTCLabelDecode', 'TableLabelDecode', + 'DistillationDBPostProcess', 'NRTRLabelDecode', 'SARLabelDecode', + 'SEEDLabelDecode', 'VQASerTokenLayoutLMPostProcess', + 'VQAReTokenLayoutLMPostProcess', 'PRENLabelDecode', + 'DistillationSARLabelDecode', 'ViTSTRLabelDecode', 'ABINetLabelDecode', + 'TableMasterLabelDecode', 'SPINLabelDecode', + 'DistillationSerPostProcess', 'DistillationRePostProcess', + 'VLLabelDecode', 'PicoDetPostProcess', 'CTPostProcess', + 'RFLLabelDecode', 'DRRGPostprocess', 'CANLabelDecode' + ] + + if config['name'] == 'PSEPostProcess': + from .pse_postprocess import PSEPostProcess + support_dict.append('PSEPostProcess') + + config = copy.deepcopy(config) + module_name = config.pop('name') + if module_name == "None": + return + if global_config is not None: + config.update(global_config) + assert module_name in support_dict, Exception( + 'post process only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/ppocr/postprocess/cls_postprocess.py b/ppocr/postprocess/cls_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..9a27ba0831358564d99a6ec698a5019eae1c25f7 --- /dev/null +++ b/ppocr/postprocess/cls_postprocess.py @@ -0,0 +1,42 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
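# Editor's note: ClsPostProcess below turns a (batch, num_classes) score array into
# (label, score) pairs via argmax. A hedged sketch assuming a two-class angle
# classifier with labels '0' and '180' (the label list is an assumption, not defined
# in this file):
#
#     import numpy as np
#
#     post = ClsPostProcess(label_list=['0', '180'])
#     preds = np.array([[0.9, 0.1], [0.2, 0.8]], dtype='float32')
#     print(post(preds))  # [('0', 0.9), ('180', 0.8)]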
+import paddle + + +class ClsPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, label_list=None, key=None, **kwargs): + super(ClsPostProcess, self).__init__() + self.label_list = label_list + self.key = key + + def __call__(self, preds, label=None, *args, **kwargs): + if self.key is not None: + preds = preds[self.key] + + label_list = self.label_list + if label_list is None: + label_list = {idx: idx for idx in range(preds.shape[-1])} + + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + pred_idxs = preds.argmax(axis=1) + decode_out = [(label_list[idx], preds[i, idx]) + for i, idx in enumerate(pred_idxs)] + if label is None: + return decode_out + label = [(label_list[idx], 1.0) for idx in label] + return decode_out, label diff --git a/ppocr/postprocess/ct_postprocess.py b/ppocr/postprocess/ct_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..3ab90be24d65888339698a5abe2ed692ceaab4c7 --- /dev/null +++ b/ppocr/postprocess/ct_postprocess.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refered from: +https://github.com/shengtao96/CentripetalText/blob/main/test.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import os.path as osp +import numpy as np +import cv2 +import paddle +import pyclipper + + +class CTPostProcess(object): + """ + The post process for Centripetal Text (CT). 
+ """ + + def __init__(self, min_score=0.88, min_area=16, box_type='poly', **kwargs): + self.min_score = min_score + self.min_area = min_area + self.box_type = box_type + + self.coord = np.zeros((2, 300, 300), dtype=np.int32) + for i in range(300): + for j in range(300): + self.coord[0, i, j] = j + self.coord[1, i, j] = i + + def __call__(self, preds, batch): + outs = preds['maps'] + out_scores = preds['score'] + + if isinstance(outs, paddle.Tensor): + outs = outs.numpy() + if isinstance(out_scores, paddle.Tensor): + out_scores = out_scores.numpy() + + batch_size = outs.shape[0] + boxes_batch = [] + for idx in range(batch_size): + bboxes = [] + scores = [] + + img_shape = batch[idx] + + org_img_size = img_shape[:3] + img_shape = img_shape[3:] + img_size = img_shape[:2] + + out = np.expand_dims(outs[idx], axis=0) + outputs = dict() + + score = np.expand_dims(out_scores[idx], axis=0) + + kernel = out[:, 0, :, :] > 0.2 + loc = out[:, 1:, :, :].astype("float32") + + score = score[0].astype(np.float32) + kernel = kernel[0].astype(np.uint8) + loc = loc[0].astype(np.float32) + + label_num, label_kernel = cv2.connectedComponents( + kernel, connectivity=4) + + for i in range(1, label_num): + ind = (label_kernel == i) + if ind.sum( + ) < 10: # pixel number less than 10, treated as background + label_kernel[ind] = 0 + + label = np.zeros_like(label_kernel) + h, w = label_kernel.shape + pixels = self.coord[:, :h, :w].reshape(2, -1) + points = pixels.transpose([1, 0]).astype(np.float32) + + off_points = (points + 10. / 4. * loc[:, pixels[1], pixels[0]].T + ).astype(np.int32) + off_points[:, 0] = np.clip(off_points[:, 0], 0, label.shape[1] - 1) + off_points[:, 1] = np.clip(off_points[:, 1], 0, label.shape[0] - 1) + + label[pixels[1], pixels[0]] = label_kernel[off_points[:, 1], + off_points[:, 0]] + label[label_kernel > 0] = label_kernel[label_kernel > 0] + + score_pocket = [0.0] + for i in range(1, label_num): + ind = (label_kernel == i) + if ind.sum() == 0: + score_pocket.append(0.0) + continue + score_i = np.mean(score[ind]) + score_pocket.append(score_i) + + label_num = np.max(label) + 1 + label = cv2.resize( + label, (img_size[1], img_size[0]), + interpolation=cv2.INTER_NEAREST) + + scale = (float(org_img_size[1]) / float(img_size[1]), + float(org_img_size[0]) / float(img_size[0])) + + for i in range(1, label_num): + ind = (label == i) + points = np.array(np.where(ind)).transpose((1, 0)) + + if points.shape[0] < self.min_area: + continue + + score_i = score_pocket[i] + if score_i < self.min_score: + continue + + if self.box_type == 'rect': + rect = cv2.minAreaRect(points[:, ::-1]) + bbox = cv2.boxPoints(rect) * scale + z = bbox.mean(0) + bbox = z + (bbox - z) * 0.85 + elif self.box_type == 'poly': + binary = np.zeros(label.shape, dtype='uint8') + binary[ind] = 1 + try: + _, contours, _ = cv2.findContours( + binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + except BaseException: + contours, _ = cv2.findContours( + binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + bbox = contours[0] * scale + + bbox = bbox.astype('int32') + bboxes.append(bbox.reshape(-1, 2)) + scores.append(score_i) + + boxes_batch.append({'points': bboxes}) + + return boxes_batch diff --git a/ppocr/postprocess/db_postprocess.py b/ppocr/postprocess/db_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..244825b76a47162419b4ae68103b182331be1791 --- /dev/null +++ b/ppocr/postprocess/db_postprocess.py @@ -0,0 +1,276 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refered from: +https://github.com/WenmuZhou/DBNet.pytorch/blob/master/post_processing/seg_detector_representer.py +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import paddle +from shapely.geometry import Polygon +import pyclipper + + +class DBPostProcess(object): + """ + The post process for Differentiable Binarization (DB). + """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + score_mode="fast", + box_type='quad', + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.score_mode = score_mode + self.box_type = box_type + assert score_mode in [ + "slow", "fast" + ], "Score mode must be in [slow, fast] but got: {}".format(score_mode) + + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + boxes = [] + scores = [] + + contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), + cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + + for contour in contours[:self.max_candidates]: + epsilon = 0.002 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + points = approx.reshape((-1, 2)) + if points.shape[0] < 4: + continue + + score = self.box_score_fast(pred, points.reshape(-1, 2)) + if self.box_thresh > score: + continue + + if points.shape[0] > 2: + box = self.unclip(points, self.unclip_ratio) + if len(box) > 1: + continue + else: + continue + box = box.reshape(-1, 2) + + _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2))) + if sside < self.min_size + 2: + continue + + box = np.array(box) + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.tolist()) + scores.append(score) + return boxes, scores + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + if self.score_mode == "fast": + score 
= self.box_score_fast(pred, points.reshape(-1, 2)) + else: + score = self.box_score_slow(pred, contour) + if self.box_thresh > score: + continue + + box = self.unclip(points, self.unclip_ratio).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype("int32")) + scores.append(score) + return np.array(boxes, dtype="int32"), scores + + def unclip(self, box, unclip_ratio): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + ''' + box_score_fast: use bbox mean score as the mean score + ''' + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def box_score_slow(self, bitmap, contour): + ''' + box_score_slow: use polyon mean score as the mean score + ''' + h, w = bitmap.shape[:2] + contour = contour.copy() + contour = np.reshape(contour, (-1, 2)) + + xmin = np.clip(np.min(contour[:, 0]), 0, w - 1) + xmax = np.clip(np.max(contour[:, 0]), 0, w - 1) + ymin = np.clip(np.min(contour[:, 1]), 0, h - 1) + ymax = np.clip(np.max(contour[:, 1]), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + + contour[:, 0] = contour[:, 0] - xmin + contour[:, 1] = contour[:, 1] - ymin + + cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + if self.box_type == 'poly': + boxes, scores = self.polygons_from_bitmap(pred[batch_index], + mask, src_w, src_h) + elif self.box_type == 'quad': + boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + else: + raise 
ValueError("box_type can only be one of ['quad', 'poly']") + + boxes_batch.append({'points': boxes}) + return boxes_batch + + +class DistillationDBPostProcess(object): + def __init__(self, + model_name=["student"], + key=None, + thresh=0.3, + box_thresh=0.6, + max_candidates=1000, + unclip_ratio=1.5, + use_dilation=False, + score_mode="fast", + box_type='quad', + **kwargs): + self.model_name = model_name + self.key = key + self.post_process = DBPostProcess( + thresh=thresh, + box_thresh=box_thresh, + max_candidates=max_candidates, + unclip_ratio=unclip_ratio, + use_dilation=use_dilation, + score_mode=score_mode, + box_type=box_type) + + def __call__(self, predicts, shape_list): + results = {} + for k in self.model_name: + results[k] = self.post_process(predicts[k], shape_list=shape_list) + return results diff --git a/ppocr/postprocess/drrg_postprocess.py b/ppocr/postprocess/drrg_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..353081c9d4d0fa1d04d995c84445445767276cc8 --- /dev/null +++ b/ppocr/postprocess/drrg_postprocess.py @@ -0,0 +1,326 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/main/mmocr/models/textdet/postprocess/drrg_postprocessor.py +""" + +import functools +import operator + +import numpy as np +import paddle +from numpy.linalg import norm +import cv2 + + +class Node: + def __init__(self, ind): + self.__ind = ind + self.__links = set() + + @property + def ind(self): + return self.__ind + + @property + def links(self): + return set(self.__links) + + def add_link(self, link_node): + self.__links.add(link_node) + link_node.__links.add(self) + + +def graph_propagation(edges, scores, text_comps, edge_len_thr=50.): + assert edges.ndim == 2 + assert edges.shape[1] == 2 + assert edges.shape[0] == scores.shape[0] + assert text_comps.ndim == 2 + assert isinstance(edge_len_thr, float) + + edges = np.sort(edges, axis=1) + score_dict = {} + for i, edge in enumerate(edges): + if text_comps is not None: + box1 = text_comps[edge[0], :8].reshape(4, 2) + box2 = text_comps[edge[1], :8].reshape(4, 2) + center1 = np.mean(box1, axis=0) + center2 = np.mean(box2, axis=0) + distance = norm(center1 - center2) + if distance > edge_len_thr: + scores[i] = 0 + if (edge[0], edge[1]) in score_dict: + score_dict[edge[0], edge[1]] = 0.5 * ( + score_dict[edge[0], edge[1]] + scores[i]) + else: + score_dict[edge[0], edge[1]] = scores[i] + + nodes = np.sort(np.unique(edges.flatten())) + mapping = -1 * np.ones((np.max(nodes) + 1), dtype=np.int) + mapping[nodes] = np.arange(nodes.shape[0]) + order_inds = mapping[edges] + vertices = [Node(node) for node in nodes] + for ind in order_inds: + vertices[ind[0]].add_link(vertices[ind[1]]) + + return vertices, score_dict + + +def connected_components(nodes, score_dict, link_thr): + assert isinstance(nodes, list) + assert all([isinstance(node, Node) for node in nodes]) + assert isinstance(score_dict, dict) 
+ assert isinstance(link_thr, float) + + clusters = [] + nodes = set(nodes) + while nodes: + node = nodes.pop() + cluster = {node} + node_queue = [node] + while node_queue: + node = node_queue.pop(0) + neighbors = set([ + neighbor for neighbor in node.links + if score_dict[tuple(sorted([node.ind, neighbor.ind]))] >= + link_thr + ]) + neighbors.difference_update(cluster) + nodes.difference_update(neighbors) + cluster.update(neighbors) + node_queue.extend(neighbors) + clusters.append(list(cluster)) + return clusters + + +def clusters2labels(clusters, num_nodes): + assert isinstance(clusters, list) + assert all([isinstance(cluster, list) for cluster in clusters]) + assert all( + [isinstance(node, Node) for cluster in clusters for node in cluster]) + assert isinstance(num_nodes, int) + + node_labels = np.zeros(num_nodes) + for cluster_ind, cluster in enumerate(clusters): + for node in cluster: + node_labels[node.ind] = cluster_ind + return node_labels + + +def remove_single(text_comps, comp_pred_labels): + assert text_comps.ndim == 2 + assert text_comps.shape[0] == comp_pred_labels.shape[0] + + single_flags = np.zeros_like(comp_pred_labels) + pred_labels = np.unique(comp_pred_labels) + for label in pred_labels: + current_label_flag = (comp_pred_labels == label) + if np.sum(current_label_flag) == 1: + single_flags[np.where(current_label_flag)[0][0]] = 1 + keep_ind = [i for i in range(len(comp_pred_labels)) if not single_flags[i]] + filtered_text_comps = text_comps[keep_ind, :] + filtered_labels = comp_pred_labels[keep_ind] + + return filtered_text_comps, filtered_labels + + +def norm2(point1, point2): + return ((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)**0.5 + + +def min_connect_path(points): + assert isinstance(points, list) + assert all([isinstance(point, list) for point in points]) + assert all([isinstance(coord, int) for point in points for coord in point]) + + points_queue = points.copy() + shortest_path = [] + current_edge = [[], []] + + edge_dict0 = {} + edge_dict1 = {} + current_edge[0] = points_queue[0] + current_edge[1] = points_queue[0] + points_queue.remove(points_queue[0]) + while points_queue: + for point in points_queue: + length0 = norm2(point, current_edge[0]) + edge_dict0[length0] = [point, current_edge[0]] + length1 = norm2(current_edge[1], point) + edge_dict1[length1] = [current_edge[1], point] + key0 = min(edge_dict0.keys()) + key1 = min(edge_dict1.keys()) + + if key0 <= key1: + start = edge_dict0[key0][0] + end = edge_dict0[key0][1] + shortest_path.insert(0, [points.index(start), points.index(end)]) + points_queue.remove(start) + current_edge[0] = start + else: + start = edge_dict1[key1][0] + end = edge_dict1[key1][1] + shortest_path.append([points.index(start), points.index(end)]) + points_queue.remove(end) + current_edge[1] = end + + edge_dict0 = {} + edge_dict1 = {} + + shortest_path = functools.reduce(operator.concat, shortest_path) + shortest_path = sorted(set(shortest_path), key=shortest_path.index) + + return shortest_path + + +def in_contour(cont, point): + x, y = point + is_inner = cv2.pointPolygonTest(cont, (int(x), int(y)), False) > 0.5 + return is_inner + + +def fix_corner(top_line, bot_line, start_box, end_box): + assert isinstance(top_line, list) + assert all(isinstance(point, list) for point in top_line) + assert isinstance(bot_line, list) + assert all(isinstance(point, list) for point in bot_line) + assert start_box.shape == end_box.shape == (4, 2) + + contour = np.array(top_line + bot_line[::-1]) + start_left_mid = (start_box[0] + 
start_box[3]) / 2 + start_right_mid = (start_box[1] + start_box[2]) / 2 + end_left_mid = (end_box[0] + end_box[3]) / 2 + end_right_mid = (end_box[1] + end_box[2]) / 2 + if not in_contour(contour, start_left_mid): + top_line.insert(0, start_box[0].tolist()) + bot_line.insert(0, start_box[3].tolist()) + elif not in_contour(contour, start_right_mid): + top_line.insert(0, start_box[1].tolist()) + bot_line.insert(0, start_box[2].tolist()) + if not in_contour(contour, end_left_mid): + top_line.append(end_box[0].tolist()) + bot_line.append(end_box[3].tolist()) + elif not in_contour(contour, end_right_mid): + top_line.append(end_box[1].tolist()) + bot_line.append(end_box[2].tolist()) + return top_line, bot_line + + +def comps2boundaries(text_comps, comp_pred_labels): + assert text_comps.ndim == 2 + assert len(text_comps) == len(comp_pred_labels) + boundaries = [] + if len(text_comps) < 1: + return boundaries + for cluster_ind in range(0, int(np.max(comp_pred_labels)) + 1): + cluster_comp_inds = np.where(comp_pred_labels == cluster_ind) + text_comp_boxes = text_comps[cluster_comp_inds, :8].reshape( + (-1, 4, 2)).astype(np.int32) + score = np.mean(text_comps[cluster_comp_inds, -1]) + + if text_comp_boxes.shape[0] < 1: + continue + + elif text_comp_boxes.shape[0] > 1: + centers = np.mean(text_comp_boxes, axis=1).astype(np.int32).tolist() + shortest_path = min_connect_path(centers) + text_comp_boxes = text_comp_boxes[shortest_path] + top_line = np.mean( + text_comp_boxes[:, 0:2, :], axis=1).astype(np.int32).tolist() + bot_line = np.mean( + text_comp_boxes[:, 2:4, :], axis=1).astype(np.int32).tolist() + top_line, bot_line = fix_corner( + top_line, bot_line, text_comp_boxes[0], text_comp_boxes[-1]) + boundary_points = top_line + bot_line[::-1] + + else: + top_line = text_comp_boxes[0, 0:2, :].astype(np.int32).tolist() + bot_line = text_comp_boxes[0, 2:4:-1, :].astype(np.int32).tolist() + boundary_points = top_line + bot_line + + boundary = [p for coord in boundary_points for p in coord] + [score] + boundaries.append(boundary) + + return boundaries + + +class DRRGPostprocess(object): + """Merge text components and construct boundaries of text instances. + + Args: + link_thr (float): The edge score threshold. + """ + + def __init__(self, link_thr, **kwargs): + assert isinstance(link_thr, float) + self.link_thr = link_thr + + def __call__(self, preds, shape_list): + """ + Args: + edges (ndarray): The edge array of shape N * 2, each row is a node + index pair that makes up an edge in graph. + scores (ndarray): The edge score array of shape (N,). + text_comps (ndarray): The text components. + + Returns: + List[list[float]]: The predicted boundaries of text instances. 
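A quick sketch of the data layout `comps2boundaries` consumes, with made-up numbers: each `text_comps` row holds the four corner points of one component (8 values) followed by its confidence, a cluster's score is the mean component score, and the per-component centers are what `min_connect_path` later orders into a chain.

```python
# Toy illustration of the text_comps layout: 8 corner coordinates plus a score per row.
import numpy as np

text_comps = np.array([
    # x1, y1,  x2, y2,  x3, y3,  x4, y4,  score
    [10., 10., 40., 10., 40., 30., 10., 30., 0.95],
    [45., 11., 75., 11., 75., 31., 45., 31., 0.90],
])
labels = np.array([0, 0])                    # both components fall into cluster 0

boxes = text_comps[labels == 0, :8].reshape(-1, 4, 2)
score = float(text_comps[labels == 0, -1].mean())
centers = boxes.mean(axis=1)                 # one (x, y) center per component
print(centers.tolist(), round(score, 3))     # [[25.0, 20.0], [60.0, 21.0]] 0.925
```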
+ """ + edges, scores, text_comps = preds + if edges is not None: + if isinstance(edges, paddle.Tensor): + edges = edges.numpy() + if isinstance(scores, paddle.Tensor): + scores = scores.numpy() + if isinstance(text_comps, paddle.Tensor): + text_comps = text_comps.numpy() + assert len(edges) == len(scores) + assert text_comps.ndim == 2 + assert text_comps.shape[1] == 9 + + vertices, score_dict = graph_propagation(edges, scores, text_comps) + clusters = connected_components(vertices, score_dict, self.link_thr) + pred_labels = clusters2labels(clusters, text_comps.shape[0]) + text_comps, pred_labels = remove_single(text_comps, pred_labels) + boundaries = comps2boundaries(text_comps, pred_labels) + else: + boundaries = [] + + boundaries, scores = self.resize_boundary( + boundaries, (1 / shape_list[0, 2:]).tolist()[::-1]) + boxes_batch = [dict(points=boundaries, scores=scores)] + return boxes_batch + + def resize_boundary(self, boundaries, scale_factor): + """Rescale boundaries via scale_factor. + + Args: + boundaries (list[list[float]]): The boundary list. Each boundary + with size 2k+1 with k>=4. + scale_factor(ndarray): The scale factor of size (4,). + + Returns: + boundaries (list[list[float]]): The scaled boundaries. + """ + boxes = [] + scores = [] + for b in boundaries: + sz = len(b) + scores.append(b[-1]) + b = (np.array(b[:sz - 1]) * + (np.tile(scale_factor[:2], int( + (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist() + boxes.append(np.array(b).reshape([-1, 2])) + return boxes, scores diff --git a/ppocr/postprocess/east_postprocess.py b/ppocr/postprocess/east_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c194c81c6911aac0f9210109c37b76b44532e9c4 --- /dev/null +++ b/ppocr/postprocess/east_postprocess.py @@ -0,0 +1,143 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from .locality_aware_nms import nms_locality +import cv2 +import paddle + +import os +import sys + + +class EASTPostProcess(object): + """ + The post process for EAST. + """ + + def __init__(self, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2, + **kwargs): + + self.score_thresh = score_thresh + self.cover_thresh = cover_thresh + self.nms_thresh = nms_thresh + + def restore_rectangle_quad(self, origin, geometry): + """ + Restore rectangle from quadrangle. 
+ """ + # quad + origin_concat = np.concatenate( + (origin, origin, origin, origin), axis=1) # (n, 8) + pred_quads = origin_concat - geometry + pred_quads = pred_quads.reshape((-1, 4, 2)) # (n, 4, 2) + return pred_quads + + def detect(self, + score_map, + geo_map, + score_thresh=0.8, + cover_thresh=0.1, + nms_thresh=0.2): + """ + restore text boxes from score map and geo map + """ + + score_map = score_map[0] + geo_map = np.swapaxes(geo_map, 1, 0) + geo_map = np.swapaxes(geo_map, 1, 2) + # filter the score map + xy_text = np.argwhere(score_map > score_thresh) + if len(xy_text) == 0: + return [] + # sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 0])] + #restore quad proposals + text_box_restored = self.restore_rectangle_quad( + xy_text[:, ::-1] * 4, geo_map[xy_text[:, 0], xy_text[:, 1], :]) + boxes = np.zeros((text_box_restored.shape[0], 9), dtype=np.float32) + boxes[:, :8] = text_box_restored.reshape((-1, 8)) + boxes[:, 8] = score_map[xy_text[:, 0], xy_text[:, 1]] + + try: + import lanms + boxes = lanms.merge_quadrangle_n9(boxes, nms_thresh) + except: + print( + 'you should install lanms by pip3 install lanms-nova to speed up nms_locality' + ) + boxes = nms_locality(boxes.astype(np.float64), nms_thresh) + if boxes.shape[0] == 0: + return [] + # Here we filter some low score boxes by the average score map, + # this is different from the orginal paper. + for i, box in enumerate(boxes): + mask = np.zeros_like(score_map, dtype=np.uint8) + cv2.fillPoly(mask, box[:8].reshape( + (-1, 4, 2)).astype(np.int32) // 4, 1) + boxes[i, 8] = cv2.mean(score_map, mask)[0] + boxes = boxes[boxes[:, 8] > cover_thresh] + return boxes + + def sort_poly(self, p): + """ + Sort polygons. + """ + min_axis = np.argmin(np.sum(p, axis=1)) + p = p[[min_axis, (min_axis + 1) % 4,\ + (min_axis + 2) % 4, (min_axis + 3) % 4]] + if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]): + return p + else: + return p[[0, 3, 2, 1]] + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + geo_list = outs_dict['f_geo'] + if isinstance(score_list, paddle.Tensor): + score_list = score_list.numpy() + geo_list = geo_list.numpy() + img_num = len(shape_list) + dt_boxes_list = [] + for ino in range(img_num): + score = score_list[ino] + geo = geo_list[ino] + boxes = self.detect( + score_map=score, + geo_map=geo, + score_thresh=self.score_thresh, + cover_thresh=self.cover_thresh, + nms_thresh=self.nms_thresh) + boxes_norm = [] + if len(boxes) > 0: + h, w = score.shape[1:] + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + boxes = boxes[:, :8].reshape((-1, 4, 2)) + boxes[:, :, 0] /= ratio_w + boxes[:, :, 1] /= ratio_h + for i_box, box in enumerate(boxes): + box = self.sort_poly(box.astype(np.int32)) + if np.linalg.norm(box[0] - box[1]) < 5 \ + or np.linalg.norm(box[3] - box[0]) < 5: + continue + boxes_norm.append(box) + dt_boxes_list.append({'points': np.array(boxes_norm)}) + return dt_boxes_list diff --git a/ppocr/postprocess/fce_postprocess.py b/ppocr/postprocess/fce_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0716f9f2f3a7cb585fa40a2e2a27aecb606a9b --- /dev/null +++ b/ppocr/postprocess/fce_postprocess.py @@ -0,0 +1,241 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/open-mmlab/mmocr/blob/v0.3.0/mmocr/models/textdet/postprocess/wrapper.py +""" + +import cv2 +import paddle +import numpy as np +from numpy.fft import ifft +from ppocr.utils.poly_nms import poly_nms, valid_boundary + + +def fill_hole(input_mask): + h, w = input_mask.shape + canvas = np.zeros((h + 2, w + 2), np.uint8) + canvas[1:h + 1, 1:w + 1] = input_mask.copy() + + mask = np.zeros((h + 4, w + 4), np.uint8) + + cv2.floodFill(canvas, mask, (0, 0), 1) + canvas = canvas[1:h + 1, 1:w + 1].astype(np.bool) + + return ~canvas | input_mask + + +def fourier2poly(fourier_coeff, num_reconstr_points=50): + """ Inverse Fourier transform + Args: + fourier_coeff (ndarray): Fourier coefficients shaped (n, 2k+1), + with n and k being candidates number and Fourier degree + respectively. + num_reconstr_points (int): Number of reconstructed polygon points. + Returns: + Polygons (ndarray): The reconstructed polygons shaped (n, n') + """ + + a = np.zeros((len(fourier_coeff), num_reconstr_points), dtype='complex') + k = (len(fourier_coeff[0]) - 1) // 2 + + a[:, 0:k + 1] = fourier_coeff[:, k:] + a[:, -k:] = fourier_coeff[:, :k] + + poly_complex = ifft(a) * num_reconstr_points + polygon = np.zeros((len(fourier_coeff), num_reconstr_points, 2)) + polygon[:, :, 0] = poly_complex.real + polygon[:, :, 1] = poly_complex.imag + return polygon.astype('int32').reshape((len(fourier_coeff), -1)) + + +class FCEPostProcess(object): + """ + The post process for FCENet. + """ + + def __init__(self, + scales, + fourier_degree=5, + num_reconstr_points=50, + decoding_type='fcenet', + score_thr=0.3, + nms_thr=0.1, + alpha=1.0, + beta=1.0, + box_type='poly', + **kwargs): + + self.scales = scales + self.fourier_degree = fourier_degree + self.num_reconstr_points = num_reconstr_points + self.decoding_type = decoding_type + self.score_thr = score_thr + self.nms_thr = nms_thr + self.alpha = alpha + self.beta = beta + self.box_type = box_type + + def __call__(self, preds, shape_list): + score_maps = [] + for key, value in preds.items(): + if isinstance(value, paddle.Tensor): + value = value.numpy() + cls_res = value[:, :4, :, :] + reg_res = value[:, 4:, :, :] + score_maps.append([cls_res, reg_res]) + + return self.get_boundary(score_maps, shape_list) + + def resize_boundary(self, boundaries, scale_factor): + """Rescale boundaries via scale_factor. + + Args: + boundaries (list[list[float]]): The boundary list. Each boundary + with size 2k+1 with k>=4. + scale_factor(ndarray): The scale factor of size (4,). + + Returns: + boundaries (list[list[float]]): The scaled boundaries. 
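To see what `fourier2poly` reconstructs, here is a standalone sketch with Fourier degree k=1 and only 8 reconstruction points: the coefficient at index k is the constant term (the instance center) and the next one adds a single circular harmonic, so the inverse FFT traces a circle of radius 50 around (100, 80).

```python
# Toy reconstruction mirroring fourier2poly (k=1, 8 points instead of the default 50).
import numpy as np
from numpy.fft import ifft

k, num_points = 1, 8
coeff = np.zeros((1, 2 * k + 1), dtype=complex)
coeff[0, k] = 100 + 80j         # c_0: center as x + i*y
coeff[0, k + 1] = 50            # c_1: one positive-frequency harmonic (the radius)

a = np.zeros((1, num_points), dtype=complex)
a[:, :k + 1] = coeff[:, k:]     # c_0, c_1 at the front
a[:, -k:] = coeff[:, :k]        # c_-1 wrapped to the back
poly = ifft(a) * num_points     # one complex sample per reconstructed point
points = np.stack([poly.real, poly.imag], axis=-1).astype(np.int32)
print(points[0][:3].tolist())   # [[150, 80], [135, 115], [100, 130]]
```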
+ """ + boxes = [] + scores = [] + for b in boundaries: + sz = len(b) + valid_boundary(b, True) + scores.append(b[-1]) + b = (np.array(b[:sz - 1]) * + (np.tile(scale_factor[:2], int( + (sz - 1) / 2)).reshape(1, sz - 1))).flatten().tolist() + boxes.append(np.array(b).reshape([-1, 2])) + + return np.array(boxes, dtype=np.float32), scores + + def get_boundary(self, score_maps, shape_list): + assert len(score_maps) == len(self.scales) + boundaries = [] + for idx, score_map in enumerate(score_maps): + scale = self.scales[idx] + boundaries = boundaries + self._get_boundary_single(score_map, + scale) + + # nms + boundaries = poly_nms(boundaries, self.nms_thr) + boundaries, scores = self.resize_boundary( + boundaries, (1 / shape_list[0, 2:]).tolist()[::-1]) + + boxes_batch = [dict(points=boundaries, scores=scores)] + return boxes_batch + + def _get_boundary_single(self, score_map, scale): + assert len(score_map) == 2 + assert score_map[1].shape[1] == 4 * self.fourier_degree + 2 + + return self.fcenet_decode( + preds=score_map, + fourier_degree=self.fourier_degree, + num_reconstr_points=self.num_reconstr_points, + scale=scale, + alpha=self.alpha, + beta=self.beta, + box_type=self.box_type, + score_thr=self.score_thr, + nms_thr=self.nms_thr) + + def fcenet_decode(self, + preds, + fourier_degree, + num_reconstr_points, + scale, + alpha=1.0, + beta=2.0, + box_type='poly', + score_thr=0.3, + nms_thr=0.1): + """Decoding predictions of FCENet to instances. + + Args: + preds (list(Tensor)): The head output tensors. + fourier_degree (int): The maximum Fourier transform degree k. + num_reconstr_points (int): The points number of the polygon + reconstructed from predicted Fourier coefficients. + scale (int): The down-sample scale of the prediction. + alpha (float) : The parameter to calculate final scores. Score_{final} + = (Score_{text region} ^ alpha) + * (Score_{text center region}^ beta) + beta (float) : The parameter to calculate final score. + box_type (str): Boundary encoding type 'poly' or 'quad'. + score_thr (float) : The threshold used to filter out the final + candidates. + nms_thr (float) : The threshold of nms. + + Returns: + boundaries (list[list[float]]): The instance boundary and confidence + list. 
+ """ + assert isinstance(preds, list) + assert len(preds) == 2 + assert box_type in ['poly', 'quad'] + + cls_pred = preds[0][0] + tr_pred = cls_pred[0:2] + tcl_pred = cls_pred[2:] + + reg_pred = preds[1][0].transpose([1, 2, 0]) + x_pred = reg_pred[:, :, :2 * fourier_degree + 1] + y_pred = reg_pred[:, :, 2 * fourier_degree + 1:] + + score_pred = (tr_pred[1]**alpha) * (tcl_pred[1]**beta) + tr_pred_mask = (score_pred) > score_thr + tr_mask = fill_hole(tr_pred_mask) + + tr_contours, _ = cv2.findContours( + tr_mask.astype(np.uint8), cv2.RETR_TREE, + cv2.CHAIN_APPROX_SIMPLE) # opencv4 + + mask = np.zeros_like(tr_mask) + boundaries = [] + for cont in tr_contours: + deal_map = mask.copy().astype(np.int8) + cv2.drawContours(deal_map, [cont], -1, 1, -1) + + score_map = score_pred * deal_map + score_mask = score_map > 0 + xy_text = np.argwhere(score_mask) + dxy = xy_text[:, 1] + xy_text[:, 0] * 1j + + x, y = x_pred[score_mask], y_pred[score_mask] + c = x + y * 1j + c[:, fourier_degree] = c[:, fourier_degree] + dxy + c *= scale + + polygons = fourier2poly(c, num_reconstr_points) + score = score_map[score_mask].reshape(-1, 1) + polygons = poly_nms(np.hstack((polygons, score)).tolist(), nms_thr) + + boundaries = boundaries + polygons + + boundaries = poly_nms(boundaries, nms_thr) + + if box_type == 'quad': + new_boundaries = [] + for boundary in boundaries: + poly = np.array(boundary[:-1]).reshape(-1, 2).astype(np.float32) + score = boundary[-1] + points = cv2.boxPoints(cv2.minAreaRect(poly)) + points = np.int0(points) + new_boundaries.append(points.reshape(-1).tolist() + [score]) + boundaries = new_boundaries + + return boundaries diff --git a/ppocr/postprocess/locality_aware_nms.py b/ppocr/postprocess/locality_aware_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..d305ef681882b4a393a73190bcbd20a65d1f0c15 --- /dev/null +++ b/ppocr/postprocess/locality_aware_nms.py @@ -0,0 +1,200 @@ +""" +Locality aware nms. +This code is refered from: https://github.com/songdejia/EAST/blob/master/locality_aware_nms.py +""" + +import numpy as np +from shapely.geometry import Polygon + + +def intersection(g, p): + """ + Intersection. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + g = g.buffer(0) + p = p.buffer(0) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + union = g.area + p.area - inter + if union == 0: + return 0 + else: + return inter / union + + +def intersection_iog(g, p): + """ + Intersection_iog. + """ + g = Polygon(g[:8].reshape((4, 2))) + p = Polygon(p[:8].reshape((4, 2))) + if not g.is_valid or not p.is_valid: + return 0 + inter = Polygon(g).intersection(Polygon(p)).area + #union = g.area + p.area - inter + union = p.area + if union == 0: + print("p_area is very small") + return 0 + else: + return inter / union + + +def weighted_merge(g, p): + """ + Weighted merge. + """ + g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8]) + g[8] = (g[8] + p[8]) + return g + + +def standard_nms(S, thres): + """ + Standard nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return S[keep] + + +def standard_nms_inds(S, thres): + """ + Standard nms, retun inds. 
+ """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def nms(S, thres): + """ + nms. + """ + order = np.argsort(S[:, 8])[::-1] + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = np.array([intersection(S[i], S[t]) for t in order[1:]]) + + inds = np.where(ovr <= thres)[0] + order = order[inds + 1] + + return keep + + +def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2): + """ + soft_nms + :para boxes_in, N x 9 (coords + score) + :para threshould, eliminate cases min score(0.001) + :para Nt_thres, iou_threshi + :para sigma, gaussian weght + :method, linear or gaussian + """ + boxes = boxes_in.copy() + N = boxes.shape[0] + if N is None or N < 1: + return np.array([]) + pos, maxpos = 0, 0 + weight = 0.0 + inds = np.arange(N) + tbox, sbox = boxes[0].copy(), boxes[0].copy() + for i in range(N): + maxscore = boxes[i, 8] + maxpos = i + tbox = boxes[i].copy() + ti = inds[i] + pos = i + 1 + #get max box + while pos < N: + if maxscore < boxes[pos, 8]: + maxscore = boxes[pos, 8] + maxpos = pos + pos = pos + 1 + #add max box as a detection + boxes[i, :] = boxes[maxpos, :] + inds[i] = inds[maxpos] + #swap + boxes[maxpos, :] = tbox + inds[maxpos] = ti + tbox = boxes[i].copy() + pos = i + 1 + #NMS iteration + while pos < N: + sbox = boxes[pos].copy() + ts_iou_val = intersection(tbox, sbox) + if ts_iou_val > 0: + if method == 1: + if ts_iou_val > Nt_thres: + weight = 1 - ts_iou_val + else: + weight = 1 + elif method == 2: + weight = np.exp(-1.0 * ts_iou_val**2 / sigma) + else: + if ts_iou_val > Nt_thres: + weight = 0 + else: + weight = 1 + boxes[pos, 8] = weight * boxes[pos, 8] + #if box score falls below thresold, discard the box by + #swaping last box update N + if boxes[pos, 8] < threshold: + boxes[pos, :] = boxes[N - 1, :] + inds[pos] = inds[N - 1] + N = N - 1 + pos = pos - 1 + pos = pos + 1 + + return boxes[:N] + + +def nms_locality(polys, thres=0.3): + """ + locality aware nms of EAST + :param polys: a N*9 numpy array. first 8 coordinates, then prob + :return: boxes after nms + """ + S = [] + p = None + for g in polys: + if p is not None and intersection(g, p) > thres: + p = weighted_merge(g, p) + else: + if p is not None: + S.append(p) + p = g + if p is not None: + S.append(p) + + if len(S) == 0: + return np.array([]) + return standard_nms(np.array(S), thres) + + +if __name__ == '__main__': + # 343,350,448,135,474,143,369,359 + print( + Polygon(np.array([[343, 350], [448, 135], [474, 143], [369, 359]])) + .area) \ No newline at end of file diff --git a/ppocr/postprocess/pg_postprocess.py b/ppocr/postprocess/pg_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..058cf8b907de296094d3ed2fc7e6981939ced328 --- /dev/null +++ b/ppocr/postprocess/pg_postprocess.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from ppocr.utils.e2e_utils.pgnet_pp_utils import PGNet_PostProcess + + +class PGPostProcess(object): + """ + The post process for PGNet. + """ + + def __init__(self, + character_dict_path, + valid_set, + score_thresh, + mode, + point_gather_mode=None, + **kwargs): + self.character_dict_path = character_dict_path + self.valid_set = valid_set + self.score_thresh = score_thresh + self.mode = mode + self.point_gather_mode = point_gather_mode + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def __call__(self, outs_dict, shape_list): + post = PGNet_PostProcess( + self.character_dict_path, + self.valid_set, + self.score_thresh, + outs_dict, + shape_list, + point_gather_mode=self.point_gather_mode) + if self.mode == 'fast': + data = post.pg_postprocess_fast() + else: + data = post.pg_postprocess_slow() + return data diff --git a/ppocr/postprocess/picodet_postprocess.py b/ppocr/postprocess/picodet_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..1a0aeb4387ea4778c1c6bec910262f1c4e136084 --- /dev/null +++ b/ppocr/postprocess/picodet_postprocess.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from scipy.special import softmax + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims( + current_box, axis=0), ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. 
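The corner-form IoU that `hard_nms` relies on (`iou_of` / `area_of`) can be reproduced inline as a sanity check, without importing the module:

```python
# Standalone corner-form IoU matching the iou_of/area_of logic (toy boxes).
import numpy as np

def corner_iou(a, b, eps=1e-5):
    lt = np.maximum(a[:2], b[:2])          # overlap left-top
    rb = np.minimum(a[2:], b[2:])          # overlap right-bottom
    wh = np.clip(rb - lt, 0.0, None)
    inter = wh[0] * wh[1]
    area = lambda box: (box[2] - box[0]) * (box[3] - box[1])
    return inter / (area(a) + area(b) - inter + eps)

print(round(float(corner_iou(np.array([10., 10., 50., 30.]),
                             np.array([12., 11., 52., 31.]))), 3))   # 0.822
```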
+ """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + Returns: + area (N): return the area. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +class PicoDetPostProcess(object): + """ + Args: + input_shape (int): network input image size + ori_shape (int): ori image shape of before padding + scale_factor (float): scale factor of ori image + enable_mkldnn (bool): whether to open MKLDNN + """ + + def __init__(self, + layout_dict_path, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100): + self.labels = self.load_layout_dict(layout_dict_path) + self.strides = strides + self.score_threshold = score_threshold + self.nms_threshold = nms_threshold + self.nms_top_k = nms_top_k + self.keep_top_k = keep_top_k + + def load_layout_dict(self, layout_dict_path): + with open(layout_dict_path, 'r', encoding='utf-8') as fp: + labels = fp.readlines() + return [label.strip('\n') for label in labels] + + def warp_boxes(self, boxes, ori_shape): + """Apply transform to boxes + """ + width, height = ori_shape[1], ori_shape[0] + n = len(boxes) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + # xy = xy @ M.T # transform + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate( + (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + return xy.astype(np.float32) + else: + return boxes + + def img_info(self, ori_img, img): + origin_shape = ori_img.shape + resize_shape = img.shape + im_scale_y = resize_shape[2] / float(origin_shape[0]) + im_scale_x = resize_shape[3] / float(origin_shape[1]) + scale_factor = np.array([im_scale_y, im_scale_x], dtype=np.float32) + img_shape = np.array(img.shape[2:], dtype=np.float32) + + input_shape = np.array(img).astype('float32').shape[2:] + ori_shape = np.array((img_shape, )).astype('float32') + scale_factor = np.array((scale_factor, )).astype('float32') + return ori_shape, input_shape, scale_factor + + def __call__(self, ori_img, img, preds): + scores, raw_boxes = preds['boxes'], preds['boxes_num'] + batch_size = raw_boxes[0].shape[0] + reg_max = int(raw_boxes[0].shape[-1] / 4 - 1) + out_boxes_num = [] + out_boxes_list = [] + results = [] + ori_shape, input_shape, scale_factor = self.img_info(ori_img, img) + + for batch_id in range(batch_size): + # generate centers + decode_boxes = [] + select_scores = [] + for stride, box_distribute, score in zip(self.strides, raw_boxes, + scores): + box_distribute = box_distribute[batch_id] + score = score[batch_id] + # centers + fm_h = input_shape[0] / stride + fm_w = input_shape[1] / stride + h_range = np.arange(fm_h) + w_range = np.arange(fm_w) + ww, hh = np.meshgrid(w_range, h_range) + ct_row = (hh.flatten() + 0.5) * stride + ct_col = 
(ww.flatten() + 0.5) * stride + center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1) + + # box distribution to distance + reg_range = np.arange(reg_max + 1) + box_distance = box_distribute.reshape((-1, reg_max + 1)) + box_distance = softmax(box_distance, axis=1) + box_distance = box_distance * np.expand_dims(reg_range, axis=0) + box_distance = np.sum(box_distance, axis=1).reshape((-1, 4)) + box_distance = box_distance * stride + + # top K candidate + topk_idx = np.argsort(score.max(axis=1))[::-1] + topk_idx = topk_idx[:self.nms_top_k] + center = center[topk_idx] + score = score[topk_idx] + box_distance = box_distance[topk_idx] + + # decode box + decode_box = center + [-1, -1, 1, 1] * box_distance + + select_scores.append(score) + decode_boxes.append(decode_box) + + # nms + bboxes = np.concatenate(decode_boxes, axis=0) + confidences = np.concatenate(select_scores, axis=0) + picked_box_probs = [] + picked_labels = [] + for class_index in range(0, confidences.shape[1]): + probs = confidences[:, class_index] + mask = probs > self.score_threshold + probs = probs[mask] + if probs.shape[0] == 0: + continue + subset_boxes = bboxes[mask, :] + box_probs = np.concatenate( + [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = hard_nms( + box_probs, + iou_threshold=self.nms_threshold, + top_k=self.keep_top_k, ) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.shape[0]) + + if len(picked_box_probs) == 0: + out_boxes_list.append(np.empty((0, 4))) + out_boxes_num.append(0) + + else: + picked_box_probs = np.concatenate(picked_box_probs) + + # resize output boxes + picked_box_probs[:, :4] = self.warp_boxes( + picked_box_probs[:, :4], ori_shape[batch_id]) + im_scale = np.concatenate([ + scale_factor[batch_id][::-1], scale_factor[batch_id][::-1] + ]) + picked_box_probs[:, :4] /= im_scale + # clas score box + out_boxes_list.append( + np.concatenate( + [ + np.expand_dims( + np.array(picked_labels), + axis=-1), np.expand_dims( + picked_box_probs[:, 4], axis=-1), + picked_box_probs[:, :4] + ], + axis=1)) + out_boxes_num.append(len(picked_labels)) + + out_boxes_list = np.concatenate(out_boxes_list, axis=0) + out_boxes_num = np.asarray(out_boxes_num).astype(np.int32) + + for dt in out_boxes_list: + clsid, bbox, score = int(dt[0]), dt[2:], dt[1] + label = self.labels[clsid] + result = {'bbox': bbox, 'label': label} + results.append(result) + return results diff --git a/ppocr/postprocess/pse_postprocess/__init__.py b/ppocr/postprocess/pse_postprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..680473bf4b1863ac695dc8173778e59bd4fdacf9 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
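The distribution-to-distance decoding inside `PicoDetPostProcess.__call__` (softmax over `reg_max + 1` bins per box side, expectation over the bin indices, then scaling by the feature-map stride and offsetting the anchor center) can be sketched standalone with toy numbers:

```python
# Toy decode of one location: four sides, each predicted as a distribution over 8 bins.
import numpy as np
from scipy.special import softmax

reg_max, stride = 7, 8
center = np.array([36.0, 20.0, 36.0, 20.0])        # (cx, cy, cx, cy) for this location
logits = np.zeros((4, reg_max + 1))
logits[:, 3] = 4.0                                  # every side peaks at bin 3

prob = softmax(logits, axis=1)                      # (4, reg_max + 1)
dist = (prob * np.arange(reg_max + 1)).sum(axis=1)  # expected bin index per side (~3.06)
dist = dist * stride                                # distances in input-image pixels
box = center + np.array([-1.0, -1.0, 1.0, 1.0]) * dist
print(np.round(box, 1))                             # [ 11.5  -4.5  60.5  44.5]
```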
+ +from .pse_postprocess import PSEPostProcess \ No newline at end of file diff --git a/ppocr/postprocess/pse_postprocess/pse/README.md b/ppocr/postprocess/pse_postprocess/pse/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a19d5d1b6b1d8e6952eb054d74c6672ed10bc48 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/README.md @@ -0,0 +1,6 @@ +## 编译 +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/post_processing/pse +```python +python3 setup.py build_ext --inplace +``` diff --git a/ppocr/postprocess/pse_postprocess/pse/__init__.py b/ppocr/postprocess/pse_postprocess/pse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1903a9149a7703ac7ae9a66273eca620e0d77272 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/__init__.py @@ -0,0 +1,29 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os +import subprocess + +python_path = sys.executable + +ori_path = os.getcwd() +os.chdir('ppocr/postprocess/pse_postprocess/pse') +if subprocess.call( + '{} setup.py build_ext --inplace'.format(python_path), shell=True) != 0: + raise RuntimeError( + 'Cannot compile pse: {}, if your system is windows, you need to install all the default components of `desktop development using C++` in visual studio 2019+'. 
+ format(os.path.dirname(os.path.realpath(__file__)))) +os.chdir(ori_path) + +from .pse import pse diff --git a/ppocr/postprocess/pse_postprocess/pse/pse.pyx b/ppocr/postprocess/pse_postprocess/pse/pse.pyx new file mode 100644 index 0000000000000000000000000000000000000000..b2be49e9471865c11b840207f922258e67a554b6 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/pse.pyx @@ -0,0 +1,70 @@ + +import numpy as np +import cv2 +cimport numpy as np +cimport cython +cimport libcpp +cimport libcpp.pair +cimport libcpp.queue +from libcpp.pair cimport * +from libcpp.queue cimport * + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef np.ndarray[np.int32_t, ndim=2] _pse(np.ndarray[np.uint8_t, ndim=3] kernels, + np.ndarray[np.int32_t, ndim=2] label, + int kernel_num, + int label_num, + float min_area=0): + cdef np.ndarray[np.int32_t, ndim=2] pred + pred = np.zeros((label.shape[0], label.shape[1]), dtype=np.int32) + + for label_idx in range(1, label_num): + if np.sum(label == label_idx) < min_area: + label[label == label_idx] = 0 + + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef libcpp.queue.queue[libcpp.pair.pair[np.int16_t,np.int16_t]] nxt_que = \ + queue[libcpp.pair.pair[np.int16_t,np.int16_t]]() + cdef np.int16_t* dx = [-1, 1, 0, 0] + cdef np.int16_t* dy = [0, 0, -1, 1] + cdef np.int16_t tmpx, tmpy + + points = np.array(np.where(label > 0)).transpose((1, 0)) + for point_idx in range(points.shape[0]): + tmpx, tmpy = points[point_idx, 0], points[point_idx, 1] + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = label[tmpx, tmpy] + + cdef libcpp.pair.pair[np.int16_t,np.int16_t] cur + cdef int cur_label + for kernel_idx in range(kernel_num - 1, -1, -1): + while not que.empty(): + cur = que.front() + que.pop() + cur_label = pred[cur.first, cur.second] + + is_edge = True + for j in range(4): + tmpx = cur.first + dx[j] + tmpy = cur.second + dy[j] + if tmpx < 0 or tmpx >= label.shape[0] or tmpy < 0 or tmpy >= label.shape[1]: + continue + if kernels[kernel_idx, tmpx, tmpy] == 0 or pred[tmpx, tmpy] > 0: + continue + + que.push(pair[np.int16_t,np.int16_t](tmpx, tmpy)) + pred[tmpx, tmpy] = cur_label + is_edge = False + if is_edge: + nxt_que.push(cur) + + que, nxt_que = nxt_que, que + + return pred + +def pse(kernels, min_area): + kernel_num = kernels.shape[0] + label_num, label = cv2.connectedComponents(kernels[-1], connectivity=4) + return _pse(kernels[:-1], label, kernel_num, label_num, min_area) \ No newline at end of file diff --git a/ppocr/postprocess/pse_postprocess/pse/setup.py b/ppocr/postprocess/pse_postprocess/pse/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..03746782af791938bff31c24e4a760f566c73b49 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse/setup.py @@ -0,0 +1,14 @@ +from distutils.core import setup, Extension +from Cython.Build import cythonize +import numpy + +setup(ext_modules=cythonize(Extension( + 'pse', + sources=['pse.pyx'], + language='c++', + include_dirs=[numpy.get_include()], + library_dirs=[], + libraries=[], + extra_compile_args=['-O3'], + extra_link_args=[] +))) diff --git a/ppocr/postprocess/pse_postprocess/pse_postprocess.py b/ppocr/postprocess/pse_postprocess/pse_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..962f3efe922c4a2656e0f44f478e1baf301a5542 --- /dev/null +++ b/ppocr/postprocess/pse_postprocess/pse_postprocess.py @@ -0,0 +1,120 @@ +# copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/head/psenet_head.py +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import cv2 +import paddle +from paddle.nn import functional as F + +from ppocr.postprocess.pse_postprocess.pse import pse + + +class PSEPostProcess(object): + """ + The post process for PSE. + """ + + def __init__(self, + thresh=0.5, + box_thresh=0.85, + min_area=16, + box_type='quad', + scale=4, + **kwargs): + assert box_type in ['quad', 'poly'], 'Only quad and poly is supported' + self.thresh = thresh + self.box_thresh = box_thresh + self.min_area = min_area + self.box_type = box_type + self.scale = scale + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + if not isinstance(pred, paddle.Tensor): + pred = paddle.to_tensor(pred) + pred = F.interpolate( + pred, scale_factor=4 // self.scale, mode='bilinear') + + score = F.sigmoid(pred[:, 0, :, :]) + + kernels = (pred > self.thresh).astype('float32') + text_mask = kernels[:, 0, :, :] + text_mask = paddle.unsqueeze(text_mask, axis=1) + + kernels[:, 0:, :, :] = kernels[:, 0:, :, :] * text_mask + + score = score.numpy() + kernels = kernels.numpy().astype(np.uint8) + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + boxes, scores = self.boxes_from_bitmap(score[batch_index], + kernels[batch_index], + shape_list[batch_index]) + + boxes_batch.append({'points': boxes, 'scores': scores}) + return boxes_batch + + def boxes_from_bitmap(self, score, kernels, shape): + label = pse(kernels, self.min_area) + return self.generate_box(score, label, shape) + + def generate_box(self, score, label, shape): + src_h, src_w, ratio_h, ratio_w = shape + label_num = np.max(label) + 1 + + boxes = [] + scores = [] + for i in range(1, label_num): + ind = label == i + points = np.array(np.where(ind)).transpose((1, 0))[:, ::-1] + + if points.shape[0] < self.min_area: + label[ind] = 0 + continue + + score_i = np.mean(score[ind]) + if score_i < self.box_thresh: + label[ind] = 0 + continue + + if self.box_type == 'quad': + rect = cv2.minAreaRect(points) + bbox = cv2.boxPoints(rect) + elif self.box_type == 'poly': + box_height = np.max(points[:, 1]) + 10 + box_width = np.max(points[:, 0]) + 10 + + mask = np.zeros((box_height, box_width), np.uint8) + mask[points[:, 1], points[:, 0]] = 255 + + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + bbox = np.squeeze(contours[0], 1) + else: + raise NotImplementedError + + bbox[:, 0] = np.clip(np.round(bbox[:, 0] / ratio_w), 0, src_w) + bbox[:, 1] = np.clip(np.round(bbox[:, 1] / ratio_h), 0, src_h) + boxes.append(bbox) + scores.append(score_i) + return boxes, scores diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py new file mode 100644 index 
0000000000000000000000000000000000000000..fbf8b93e3d11121c99ce5b2dcbf2149e15453d4a --- /dev/null +++ b/ppocr/postprocess/rec_postprocess.py @@ -0,0 +1,931 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.nn import functional as F +import re + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False): + self.beg_str = "sos" + self.end_str = "eos" + self.reverse = False + self.character_str = [] + + if character_dict_path is None: + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + else: + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str.append(line) + if use_space_char: + self.character_str.append(" ") + dict_character = list(self.character_str) + if 'arabic' in character_dict_path: + self.reverse = True + + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def pred_reverse(self, pred): + pred_re = [] + c_current = '' + for c in pred: + if not bool(re.search('[a-zA-Z0-9 :*./%+-]', c)): + if c_current != '': + pred_re.append(c_current) + pred_re.append(c) + c_current = '' + else: + c_current += c + if c_current != '': + pred_re.append(c_current) + + return ''.join(pred_re[::-1]) + + def add_special_char(self, dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
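A minimal sketch of the dictionary handling in `BaseRecLabelDecode` (made-up characters): one symbol per line of the dict file, an optional trailing space when `use_space_char` is set, and a char-to-index map; subclasses then add their special tokens via `add_special_char`.

```python
# Toy dictionary -> index mapping (the real characters come from the dict file).
chars = ["的", "是", "a", "b"]          # as if read line-by-line from the dict file
chars.append(" ")                       # use_space_char=True
char_to_idx = {c: i for i, c in enumerate(chars)}
print(char_to_idx["a"], len(chars))     # 2 5
```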
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + + if self.reverse: # for arabic rec + text = self.pred_reverse(text) + + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, tuple) or isinstance(preds, list): + preds = preds[-1] + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class DistillationCTCLabelDecode(CTCLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationCTCLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['ctc'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class RFLLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(RFLLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + # if seq_outputs is not None: + if isinstance(preds, tuple) or isinstance(preds, list): + cnt_outputs, seq_outputs = preds + if isinstance(seq_outputs, paddle.Tensor): + seq_outputs = seq_outputs.numpy() + preds_idx = seq_outputs.argmax(axis=2) + preds_prob = seq_outputs.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + else: + cnt_outputs = preds + if isinstance(cnt_outputs, paddle.Tensor): + cnt_outputs = cnt_outputs.numpy() + cnt_length = [] + for lens in cnt_outputs: + length = round(np.sum(lens)) + cnt_length.append(length) + if label is None: + return cnt_length + label = self.decode(label, is_remove_duplicate=False) + length = [len(res[0]) for res in label] + return cnt_length, length + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SEEDLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SEEDLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.padding_str = "padding" + self.end_str = "eos" + self.unknown = "unknown" + dict_character = dict_character + [ + self.end_str, self.padding_str, self.unknown + ] + return dict_character + + def get_ignored_tokens(self): + end_idx = self.get_beg_end_flag_idx("eos") + return [end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "sos": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "eos": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" % beg_or_end + return idx + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + [end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + """ + text = self.decode(text) + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + preds_idx = preds["rec_pred"] + if isinstance(preds_idx, paddle.Tensor): + preds_idx = preds_idx.numpy() + if "rec_pred_scores" in preds: + preds_idx = preds["rec_pred"] + preds_prob = preds["rec_pred_scores"] + else: + preds_idx = preds["rec_pred"].argmax(axis=2) + preds_prob = preds["rec_pred"].max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + +class SRNLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SRNLabelDecode, self).__init__(character_dict_path, + use_space_char) + self.max_text_length = kwargs.get('max_text_length', 25) + + def __call__(self, preds, label=None, *args, **kwargs): + pred = preds['predict'] + char_num = len(self.character_str) + 2 + if isinstance(pred, paddle.Tensor): + pred = pred.numpy() + pred = np.reshape(pred, [-1, char_num]) + + preds_idx = np.argmax(pred, axis=1) + preds_prob = np.max(pred, axis=1) + + preds_idx = np.reshape(preds_idx, [-1, self.max_text_length]) + + preds_prob = np.reshape(preds_prob, [-1, self.max_text_length]) + + text = self.decode(preds_idx, preds_prob) + + if label is None: + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + return text + label = self.decode(label) + return text, label + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class DistillationSARLabelDecode(SARLabelDecode): + """ + Convert + Convert between text-label and text-index + """ + + def __init__(self, + character_dict_path=None, + use_space_char=False, + model_name=["student"], + key=None, + multi_head=False, + **kwargs): + super(DistillationSARLabelDecode, self).__init__(character_dict_path, + use_space_char) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + + self.key = key + self.multi_head = multi_head + + def __call__(self, preds, label=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + if self.multi_head and isinstance(pred, dict): + pred = pred['sar'] + output[name] = super().__call__(pred, label=label, *args, **kwargs) + return output + + +class PRENLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(PRENLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + padding_str = '' # 0 + end_str = '' # 1 + unknown_str = '' # 2 + + dict_character = [padding_str, end_str, unknown_str] + dict_character + self.padding_idx = 0 + self.end_idx = 1 + self.unknown_idx = 2 + + return dict_character + + def decode(self, text_index, text_prob=None): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] == self.end_idx: + break + if text_index[batch_idx][idx] in \ + [self.padding_idx, self.unknown_idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + if len(text) > 0: + result_list.append((text, np.mean(conf_list).tolist())) + else: + # here confidence of empty recog result is 1 + result_list.append(('', 1)) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob) + if label is None: + return text + label = self.decode(label) + return text, label + + +class NRTRLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=True, **kwargs): + super(NRTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + if len(preds) == 2: + preds_id = preds[0] + preds_prob = preds[1] + if isinstance(preds_id, paddle.Tensor): + preds_id = preds_id.numpy() + if isinstance(preds_prob, paddle.Tensor): + preds_prob = preds_prob.numpy() + if preds_id[0][0] == 2: + preds_idx = preds_id[:, 1:] + preds_prob = preds_prob[:, 1:] + else: + preds_idx = preds_id + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + else: + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank', '', '', ''] + dict_character + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + try: + char_idx = self.character[int(text_index[batch_idx][idx])] + except: + continue + if char_idx == '': # end + break + char_list.append(char_idx) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text.lower(), np.mean(conf_list).tolist())) + return result_list + + +class ViTSTRLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ViTSTRLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, paddle.Tensor): + preds = preds[:, 1:].numpy() + else: + preds = preds[:, 1:] + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label[:, 1:]) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['', ''] + dict_character + return dict_character + + +class ABINetLabelDecode(NRTRLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(ABINetLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, dict): + preds = preds['align'][-1].numpy() + elif isinstance(preds, paddle.Tensor): + preds = preds.numpy() + else: + preds = preds + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + if label is None: + return text + label = self.decode(label) + return text, label + + def add_special_char(self, dict_character): + dict_character = [''] + dict_character + return dict_character + + +class SPINLabelDecode(AttnLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SPINLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def add_special_char(self, dict_character): + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + [self.end_str] + dict_character + return dict_character + + +class VLLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(VLLabelDecode, self).__init__(character_dict_path, use_space_char) + self.max_text_length = kwargs.get('max_text_length', 25) + self.nclass = len(self.character) + 1 + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + selection = np.ones(len(text_index[batch_idx]), dtype=bool) + if is_remove_duplicate: + selection[1:] = text_index[batch_idx][1:] != text_index[ + batch_idx][:-1] + for ignored_token in ignored_tokens: + selection &= text_index[batch_idx] != ignored_token + + char_list = [ + self.character[text_id - 1] + for text_id in text_index[batch_idx][selection] + ] + if text_prob is not None: + conf_list = text_prob[batch_idx][selection] + else: + conf_list = [1] * len(selection) + if len(conf_list) == 0: + conf_list = [0] + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, length=None, *args, **kwargs): + if len(preds) == 2: # eval mode + text_pre, x = preds + b = text_pre.shape[1] + lenText = self.max_text_length + nsteps = self.max_text_length + + if not isinstance(text_pre, paddle.Tensor): + text_pre = paddle.to_tensor(text_pre, dtype='float32') + + out_res = paddle.zeros( + shape=[lenText, b, self.nclass], dtype=x.dtype) + out_length = paddle.zeros(shape=[b], dtype=x.dtype) + now_step = 0 + for _ in range(nsteps): + if 0 in out_length and now_step < nsteps: + tmp_result = text_pre[now_step, :, :] + out_res[now_step] = tmp_result + tmp_result = tmp_result.topk(1)[1].squeeze(axis=1) + for j in range(b): + if out_length[j] == 0 and tmp_result[j] == 0: + out_length[j] = now_step + 1 + now_step += 1 + for j in range(0, b): + if int(out_length[j]) == 0: + out_length[j] = nsteps + start = 0 + output = paddle.zeros( + shape=[int(out_length.sum()), self.nclass], dtype=x.dtype) + for i in range(0, b): + cur_length = int(out_length[i]) + output[start:start + cur_length] = out_res[0:cur_length, i, :] + start += cur_length + net_out = output + length = out_length + + else: # train mode + net_out = preds[0] + length = length + net_out = paddle.concat([t[:l] for t, l in zip(net_out, length)]) + text = [] + if not isinstance(net_out, paddle.Tensor): + net_out = paddle.to_tensor(net_out, dtype='float32') + net_out = F.softmax(net_out, axis=1) + for i in range(0, length.shape[0]): + preds_idx = net_out[int(length[:i].sum()):int(length[:i].sum( + ) + length[i])].topk(1)[1][:, 0].tolist() + preds_text = ''.join([ + self.character[idx - 1] + if idx > 0 and idx <= len(self.character) else '' + for idx in preds_idx + ]) + preds_prob = net_out[int(length[:i].sum()):int(length[:i].sum( + ) + length[i])].topk(1)[0][:, 0] + preds_prob = paddle.exp( + paddle.log(preds_prob).sum() / (preds_prob.shape[0] + 1e-6)) + text.append((preds_text, preds_prob.numpy()[0])) + if label is None: + return text + label = self.decode(label) + return text, label + + +class CANLabelDecode(BaseRecLabelDecode): + """ Convert between latex-symbol and symbol-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CANLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def decode(self, text_index, preds_prob=None): + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + seq_end = text_index[batch_idx].argmin(0) + idx_list = text_index[batch_idx][:seq_end].tolist() + symbol_list = [self.character[idx] for idx in idx_list] + probs = [] + if preds_prob is not None: + probs = preds_prob[batch_idx][:len(symbol_list)].tolist() + + result_list.append([' '.join(symbol_list), probs]) + return result_list + + def 
__call__(self, preds, label=None, *args, **kwargs): + pred_prob, _, _, _ = preds + preds_idx = pred_prob.argmax(axis=2) + + text = self.decode(preds_idx) + if label is None: + return text + label = self.decode(label) + return text, label diff --git a/ppocr/postprocess/sast_postprocess.py b/ppocr/postprocess/sast_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..bee75c05b1a3ea59193d566f91378c96797f533b --- /dev/null +++ b/ppocr/postprocess/sast_postprocess.py @@ -0,0 +1,355 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) + +import numpy as np +from .locality_aware_nms import nms_locality +import paddle +import cv2 +import time + + +class SASTPostProcess(object): + """ + The post process for SAST. + """ + + def __init__(self, + score_thresh=0.5, + nms_thresh=0.2, + sample_pts_num=2, + shrink_ratio_of_width=0.3, + expand_scale=1.0, + tcl_map_thresh=0.5, + **kwargs): + + self.score_thresh = score_thresh + self.nms_thresh = nms_thresh + self.sample_pts_num = sample_pts_num + self.shrink_ratio_of_width = shrink_ratio_of_width + self.expand_scale = expand_scale + self.tcl_map_thresh = tcl_map_thresh + + # c++ la-nms is faster, but only support python 3.5 + self.is_python35 = False + if sys.version_info.major == 3 and sys.version_info.minor == 5: + self.is_python35 = True + + def point_pair2poly(self, point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + # constract poly + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + def shrink_quad_along_width(self, + quad, + begin_width_ratio=0., + end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + def expand_poly_along_width(self, poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. 
+ """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = self.shrink_quad_along_width(left_quad, left_ratio, + 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = self.shrink_quad_along_width(right_quad, 0.0, + right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + def restore_quad(self, tcl_map, tcl_map_thresh, tvo_map): + """Restore quad.""" + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + xy_text = xy_text[:, ::-1] # (n, 2) + + # Sort the text boxes via the y axis + xy_text = xy_text[np.argsort(xy_text[:, 1])] + + scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + scores = scores[:, np.newaxis] + + # Restore + point_num = int(tvo_map.shape[-1] / 2) + assert point_num == 4 + tvo_map = tvo_map[xy_text[:, 1], xy_text[:, 0], :] + xy_text_tile = np.tile(xy_text, (1, point_num)) # (n, point_num * 2) + quads = xy_text_tile - tvo_map + + return scores, quads, xy_text + + def quad_area(self, quad): + """ + compute area of a quad. + """ + edge = [(quad[1][0] - quad[0][0]) * (quad[1][1] + quad[0][1]), + (quad[2][0] - quad[1][0]) * (quad[2][1] + quad[1][1]), + (quad[3][0] - quad[2][0]) * (quad[3][1] + quad[2][1]), + (quad[0][0] - quad[3][0]) * (quad[0][1] + quad[3][1])] + return np.sum(edge) / 2. + + def nms(self, dets): + if self.is_python35: + import lanms + dets = lanms.merge_quadrangle_n9(dets, self.nms_thresh) + else: + dets = nms_locality(dets, self.nms_thresh) + return dets + + def cluster_by_quads_tco(self, tcl_map, tcl_map_thresh, quads, tco_map): + """ + Cluster pixels in tcl_map based on quads. + """ + instance_count = quads.shape[0] + 1 # contain background + instance_label_map = np.zeros(tcl_map.shape[:2], dtype=np.int32) + if instance_count == 1: + return instance_count, instance_label_map + + # predict text center + xy_text = np.argwhere(tcl_map[:, :, 0] > tcl_map_thresh) + n = xy_text.shape[0] + xy_text = xy_text[:, ::-1] # (n, 2) + tco = tco_map[xy_text[:, 1], xy_text[:, 0], :] # (n, 2) + pred_tc = xy_text - tco + + # get gt text center + m = quads.shape[0] + gt_tc = np.mean(quads, axis=1) # (m, 2) + + pred_tc_tile = np.tile(pred_tc[:, np.newaxis, :], + (1, m, 1)) # (n, m, 2) + gt_tc_tile = np.tile(gt_tc[np.newaxis, :, :], (n, 1, 1)) # (n, m, 2) + dist_mat = np.linalg.norm(pred_tc_tile - gt_tc_tile, axis=2) # (n, m) + xy_text_assign = np.argmin(dist_mat, axis=1) + 1 # (n,) + + instance_label_map[xy_text[:, 1], xy_text[:, 0]] = xy_text_assign + return instance_count, instance_label_map + + def estimate_sample_pts_num(self, quad, xy_text): + """ + Estimate sample points number. 
+ """ + eh = (np.linalg.norm(quad[0] - quad[3]) + + np.linalg.norm(quad[1] - quad[2])) / 2.0 + ew = (np.linalg.norm(quad[0] - quad[1]) + + np.linalg.norm(quad[2] - quad[3])) / 2.0 + + dense_sample_pts_num = max(2, int(ew)) + dense_xy_center_line = xy_text[np.linspace( + 0, + xy_text.shape[0] - 1, + dense_sample_pts_num, + endpoint=True, + dtype=np.float32).astype(np.int32)] + + dense_xy_center_line_diff = dense_xy_center_line[ + 1:] - dense_xy_center_line[:-1] + estimate_arc_len = np.sum( + np.linalg.norm( + dense_xy_center_line_diff, axis=1)) + + sample_pts_num = max(2, int(estimate_arc_len / eh)) + return sample_pts_num + + def detect_sast(self, + tcl_map, + tvo_map, + tbo_map, + tco_map, + ratio_w, + ratio_h, + src_w, + src_h, + shrink_ratio_of_width=0.3, + tcl_map_thresh=0.5, + offset_expand=1.0, + out_strid=4.0): + """ + first resize the tcl_map, tvo_map and tbo_map to the input_size, then restore the polys + """ + # restore quad + scores, quads, xy_text = self.restore_quad(tcl_map, tcl_map_thresh, + tvo_map) + dets = np.hstack((quads, scores)).astype(np.float32, copy=False) + dets = self.nms(dets) + if dets.shape[0] == 0: + return [] + quads = dets[:, :-1].reshape(-1, 4, 2) + + # Compute quad area + quad_areas = [] + for quad in quads: + quad_areas.append(-self.quad_area(quad)) + + # instance segmentation + # instance_count, instance_label_map = cv2.connectedComponents(tcl_map.astype(np.uint8), connectivity=8) + instance_count, instance_label_map = self.cluster_by_quads_tco( + tcl_map, tcl_map_thresh, quads, tco_map) + + # restore single poly with tcl instance. + poly_list = [] + for instance_idx in range(1, instance_count): + xy_text = np.argwhere(instance_label_map == instance_idx)[:, ::-1] + quad = quads[instance_idx - 1] + q_area = quad_areas[instance_idx - 1] + if q_area < 5: + continue + + # + len1 = float(np.linalg.norm(quad[0] - quad[1])) + len2 = float(np.linalg.norm(quad[1] - quad[2])) + min_len = min(len1, len2) + if min_len < 3: + continue + + # filter small CC + if xy_text.shape[0] <= 0: + continue + + # filter low confidence instance + xy_text_scores = tcl_map[xy_text[:, 1], xy_text[:, 0], 0] + if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.1: + # if np.sum(xy_text_scores) / quad_areas[instance_idx - 1] < 0.05: + continue + + # sort xy_text + left_center_pt = np.array( + [[(quad[0, 0] + quad[-1, 0]) / 2.0, + (quad[0, 1] + quad[-1, 1]) / 2.0]]) # (1, 2) + right_center_pt = np.array( + [[(quad[1, 0] + quad[2, 0]) / 2.0, + (quad[1, 1] + quad[2, 1]) / 2.0]]) # (1, 2) + proj_unit_vec = (right_center_pt - left_center_pt) / \ + (np.linalg.norm(right_center_pt - left_center_pt) + 1e-6) + proj_value = np.sum(xy_text * proj_unit_vec, axis=1) + xy_text = xy_text[np.argsort(proj_value)] + + # Sample pts in tcl map + if self.sample_pts_num == 0: + sample_pts_num = self.estimate_sample_pts_num(quad, xy_text) + else: + sample_pts_num = self.sample_pts_num + xy_center_line = xy_text[np.linspace( + 0, + xy_text.shape[0] - 1, + sample_pts_num, + endpoint=True, + dtype=np.float32).astype(np.int32)] + + point_pair_list = [] + for x, y in xy_center_line: + # get corresponding offset + offset = tbo_map[y, x, :].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm( + offset, axis=1, keepdims=True) + expand_length = np.clip( + offset_length * (offset_expand - 1), + a_min=0.5, + a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + # original point + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = 
(ori_yx + offset)[:, ::-1] * out_strid / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + # ndarry: (x, 2), expand poly along width + detected_poly = self.point_pair2poly(point_pair_list) + detected_poly = self.expand_poly_along_width(detected_poly, + shrink_ratio_of_width) + detected_poly[:, 0] = np.clip( + detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip( + detected_poly[:, 1], a_min=0, a_max=src_h) + poly_list.append(detected_poly) + + return poly_list + + def __call__(self, outs_dict, shape_list): + score_list = outs_dict['f_score'] + border_list = outs_dict['f_border'] + tvo_list = outs_dict['f_tvo'] + tco_list = outs_dict['f_tco'] + if isinstance(score_list, paddle.Tensor): + score_list = score_list.numpy() + border_list = border_list.numpy() + tvo_list = tvo_list.numpy() + tco_list = tco_list.numpy() + + img_num = len(shape_list) + poly_lists = [] + for ino in range(img_num): + p_score = score_list[ino].transpose((1, 2, 0)) + p_border = border_list[ino].transpose((1, 2, 0)) + p_tvo = tvo_list[ino].transpose((1, 2, 0)) + p_tco = tco_list[ino].transpose((1, 2, 0)) + src_h, src_w, ratio_h, ratio_w = shape_list[ino] + + poly_list = self.detect_sast( + p_score, + p_tvo, + p_border, + p_tco, + ratio_w, + ratio_h, + src_w, + src_h, + shrink_ratio_of_width=self.shrink_ratio_of_width, + tcl_map_thresh=self.tcl_map_thresh, + offset_expand=self.expand_scale) + poly_lists.append({'points': np.array(poly_list)}) + + return poly_lists diff --git a/ppocr/postprocess/table_postprocess.py b/ppocr/postprocess/table_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..a47061f935e31b24fdb624df170f8abb38e01f40 --- /dev/null +++ b/ppocr/postprocess/table_postprocess.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
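For reference, a minimal usage sketch of the SASTPostProcess defined above. The output keys 'f_score', 'f_border', 'f_tvo', 'f_tco' and the (src_h, src_w, ratio_h, ratio_w) layout of shape_list are taken from its __call__; the concrete tensor shapes, the 512x512 network input and the random values are illustrative assumptions only.

import numpy as np
from ppocr.postprocess.sast_postprocess import SASTPostProcess

# Assumed head outputs for one image, with feature maps at 1/4 of an assumed 512x512 input.
outs_dict = {
    'f_score': np.random.rand(1, 1, 128, 128).astype('float32'),   # text centre-line score (tcl)
    'f_border': np.random.rand(1, 4, 128, 128).astype('float32'),  # border offsets (tbo)
    'f_tvo': np.random.rand(1, 8, 128, 128).astype('float32'),     # vertex offsets (tvo)
    'f_tco': np.random.rand(1, 2, 128, 128).astype('float32'),     # centroid offsets (tco)
}
# One row per image: (src_h, src_w, ratio_h, ratio_w), as unpacked in __call__.
shape_list = np.array([[768.0, 1024.0, 512 / 768, 512 / 1024]])

post_process = SASTPostProcess(score_thresh=0.5, nms_thresh=0.2)
results = post_process(outs_dict, shape_list)
# results is a list with one {'points': ndarray_of_polygons} entry per image.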
+
+import numpy as np
+import paddle
+
+from .rec_postprocess import AttnLabelDecode
+
+
+class TableLabelDecode(AttnLabelDecode):
+    """ """
+
+    def __init__(self,
+                 character_dict_path,
+                 merge_no_span_structure=False,
+                 **kwargs):
+        dict_character = []
+        with open(character_dict_path, "rb") as fin:
+            lines = fin.readlines()
+            for line in lines:
+                line = line.decode('utf-8').strip("\n").strip("\r\n")
+                dict_character.append(line)
+
+        if merge_no_span_structure:
+            if "<td></td>" not in dict_character:
+                dict_character.append("<td></td>")
+            if "<td>" in dict_character:
+                dict_character.remove("<td>")
+
+        dict_character = self.add_special_char(dict_character)
+        self.dict = {}
+        for i, char in enumerate(dict_character):
+            self.dict[char] = i
+        self.character = dict_character
+        self.td_token = ['<td>', '<td']
+
+    def __call__(self, preds, batch=None):
+        structure_probs = preds['structure_probs']
+        bbox_preds = preds['loc_preds']
+        if isinstance(structure_probs, paddle.Tensor):
+            structure_probs = structure_probs.numpy()
+        if isinstance(bbox_preds, paddle.Tensor):
+            bbox_preds = bbox_preds.numpy()
+        shape_list = batch[-1]
+        result = self.decode(structure_probs, bbox_preds, shape_list)
+        if len(batch) == 1:  # only contains shape
+            return result
+
+        label_decode_result = self.decode_label(batch)
+        return result, label_decode_result
+
+    def decode(self, structure_probs, bbox_preds, shape_list):
+        """convert text-label into text-index.
+        """
+        ignored_tokens = self.get_ignored_tokens()
+        end_idx = self.dict[self.end_str]
+
+        structure_idx = structure_probs.argmax(axis=2)
+        structure_probs = structure_probs.max(axis=2)
+
+        structure_batch_list = []
+        bbox_batch_list = []
+        batch_size = len(structure_idx)
+        for batch_idx in range(batch_size):
+            structure_list = []
+            bbox_list = []
+            score_list = []
+            for idx in range(len(structure_idx[batch_idx])):
+                char_idx = int(structure_idx[batch_idx][idx])
+                if idx > 0 and char_idx == end_idx:
+                    break
+                if char_idx in ignored_tokens:
+                    continue
+                text = self.character[char_idx]
+                if text in self.td_token:
+                    bbox = bbox_preds[batch_idx, idx]
+                    bbox = self._bbox_decode(bbox, shape_list[batch_idx])
+                    bbox_list.append(bbox)
+                structure_list.append(text)
+                score_list.append(structure_probs[batch_idx, idx])
+            structure_batch_list.append([structure_list, np.mean(score_list)])
+            bbox_batch_list.append(np.array(bbox_list))
+        result = {
+            'bbox_batch_list': bbox_batch_list,
+            'structure_batch_list': structure_batch_list,
+        }
+        return result
+
+    def decode_label(self, batch):
+        """convert text-label into text-index.
+ """ + structure_idx = batch[1] + gt_bbox_list = batch[2] + shape_list = batch[-1] + ignored_tokens = self.get_ignored_tokens() + end_idx = self.dict[self.end_str] + + structure_batch_list = [] + bbox_batch_list = [] + batch_size = len(structure_idx) + for batch_idx in range(batch_size): + structure_list = [] + bbox_list = [] + for idx in range(len(structure_idx[batch_idx])): + char_idx = int(structure_idx[batch_idx][idx]) + if idx > 0 and char_idx == end_idx: + break + if char_idx in ignored_tokens: + continue + structure_list.append(self.character[char_idx]) + + bbox = gt_bbox_list[batch_idx][idx] + if bbox.sum() != 0: + bbox = self._bbox_decode(bbox, shape_list[batch_idx]) + bbox_list.append(bbox) + structure_batch_list.append(structure_list) + bbox_batch_list.append(bbox_list) + result = { + 'bbox_batch_list': bbox_batch_list, + 'structure_batch_list': structure_batch_list, + } + return result + + def _bbox_decode(self, bbox, shape): + h, w, ratio_h, ratio_w, pad_h, pad_w = shape + bbox[0::2] *= w + bbox[1::2] *= h + return bbox + + +class TableMasterLabelDecode(TableLabelDecode): + """ """ + + def __init__(self, + character_dict_path, + box_shape='ori', + merge_no_span_structure=True, + **kwargs): + super(TableMasterLabelDecode, self).__init__(character_dict_path, + merge_no_span_structure) + self.box_shape = box_shape + assert box_shape in [ + 'ori', 'pad' + ], 'The shape used for box normalization must be ori or pad' + + def add_special_char(self, dict_character): + self.beg_str = '' + self.end_str = '' + self.unknown_str = '' + self.pad_str = '' + dict_character = dict_character + dict_character = dict_character + [ + self.unknown_str, self.beg_str, self.end_str, self.pad_str + ] + return dict_character + + def get_ignored_tokens(self): + pad_idx = self.dict[self.pad_str] + start_idx = self.dict[self.beg_str] + end_idx = self.dict[self.end_str] + unknown_idx = self.dict[self.unknown_str] + return [start_idx, end_idx, pad_idx, unknown_idx] + + def _bbox_decode(self, bbox, shape): + h, w, ratio_h, ratio_w, pad_h, pad_w = shape + if self.box_shape == 'pad': + h, w = pad_h, pad_w + bbox[0::2] *= w + bbox[1::2] *= h + bbox[0::2] /= ratio_w + bbox[1::2] /= ratio_h + x, y, w, h = bbox + x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 + bbox = np.array([x1, y1, x2, y2]) + return bbox diff --git a/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py b/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..64f7d761950249eaef2946e09365dbaab4d94c6c --- /dev/null +++ b/ppocr/postprocess/vqa_token_re_layoutlm_postprocess.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle + + +class VQAReTokenLayoutLMPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, **kwargs): + super(VQAReTokenLayoutLMPostProcess, self).__init__() + + def __call__(self, preds, label=None, *args, **kwargs): + pred_relations = preds['pred_relations'] + if isinstance(preds['pred_relations'], paddle.Tensor): + pred_relations = pred_relations.numpy() + pred_relations = self.decode_pred(pred_relations) + + if label is not None: + return self._metric(pred_relations, label) + else: + return self._infer(pred_relations, *args, **kwargs) + + def _metric(self, pred_relations, label): + return pred_relations, label[-1], label[-2] + + def _infer(self, pred_relations, *args, **kwargs): + ser_results = kwargs['ser_results'] + entity_idx_dict_batch = kwargs['entity_idx_dict_batch'] + + # merge relations and ocr info + results = [] + for pred_relation, ser_result, entity_idx_dict in zip( + pred_relations, ser_results, entity_idx_dict_batch): + result = [] + used_tail_id = [] + for relation in pred_relation: + if relation['tail_id'] in used_tail_id: + continue + used_tail_id.append(relation['tail_id']) + ocr_info_head = ser_result[entity_idx_dict[relation['head_id']]] + ocr_info_tail = ser_result[entity_idx_dict[relation['tail_id']]] + result.append((ocr_info_head, ocr_info_tail)) + results.append(result) + return results + + def decode_pred(self, pred_relations): + pred_relations_new = [] + for pred_relation in pred_relations: + pred_relation_new = [] + pred_relation = pred_relation[1:pred_relation[0, 0, 0] + 1] + for relation in pred_relation: + relation_new = dict() + relation_new['head_id'] = relation[0, 0] + relation_new['head'] = tuple(relation[1]) + relation_new['head_type'] = relation[2, 0] + relation_new['tail_id'] = relation[3, 0] + relation_new['tail'] = tuple(relation[4]) + relation_new['tail_type'] = relation[5, 0] + relation_new['type'] = relation[6, 0] + pred_relation_new.append(relation_new) + pred_relations_new.append(pred_relation_new) + return pred_relations_new + + +class DistillationRePostProcess(VQAReTokenLayoutLMPostProcess): + """ + DistillationRePostProcess + """ + + def __init__(self, model_name=["Student"], key=None, **kwargs): + super().__init__(**kwargs) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + self.key = key + + def __call__(self, preds, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + output[name] = super().__call__(pred, *args, **kwargs) + return output diff --git a/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py b/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..5541da90a05d0137628f45f72b15fd61eba1e203 --- /dev/null +++ b/ppocr/postprocess/vqa_token_ser_layoutlm_postprocess.py @@ -0,0 +1,117 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle +from ppocr.utils.utility import load_vqa_bio_label_maps + + +class VQASerTokenLayoutLMPostProcess(object): + """ Convert between text-label and text-index """ + + def __init__(self, class_path, **kwargs): + super(VQASerTokenLayoutLMPostProcess, self).__init__() + label2id_map, self.id2label_map = load_vqa_bio_label_maps(class_path) + + self.label2id_map_for_draw = dict() + for key in label2id_map: + if key.startswith("I-"): + self.label2id_map_for_draw[key] = label2id_map["B" + key[1:]] + else: + self.label2id_map_for_draw[key] = label2id_map[key] + + self.id2label_map_for_show = dict() + for key in self.label2id_map_for_draw: + val = self.label2id_map_for_draw[key] + if key == "O": + self.id2label_map_for_show[val] = key + if key.startswith("B-") or key.startswith("I-"): + self.id2label_map_for_show[val] = key[2:] + else: + self.id2label_map_for_show[val] = key + + def __call__(self, preds, batch=None, *args, **kwargs): + if isinstance(preds, tuple): + preds = preds[0] + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + if batch is not None: + return self._metric(preds, batch[5]) + else: + return self._infer(preds, **kwargs) + + def _metric(self, preds, label): + pred_idxs = preds.argmax(axis=2) + decode_out_list = [[] for _ in range(pred_idxs.shape[0])] + label_decode_out_list = [[] for _ in range(pred_idxs.shape[0])] + + for i in range(pred_idxs.shape[0]): + for j in range(pred_idxs.shape[1]): + if label[i, j] != -100: + label_decode_out_list[i].append(self.id2label_map[label[i, + j]]) + decode_out_list[i].append(self.id2label_map[pred_idxs[i, + j]]) + return decode_out_list, label_decode_out_list + + def _infer(self, preds, segment_offset_ids, ocr_infos): + results = [] + + for pred, segment_offset_id, ocr_info in zip(preds, segment_offset_ids, + ocr_infos): + pred = np.argmax(pred, axis=1) + pred = [self.id2label_map[idx] for idx in pred] + + for idx in range(len(segment_offset_id)): + if idx == 0: + start_id = 0 + else: + start_id = segment_offset_id[idx - 1] + + end_id = segment_offset_id[idx] + + curr_pred = pred[start_id:end_id] + curr_pred = [self.label2id_map_for_draw[p] for p in curr_pred] + + if len(curr_pred) <= 0: + pred_id = 0 + else: + counts = np.bincount(curr_pred) + pred_id = np.argmax(counts) + ocr_info[idx]["pred_id"] = int(pred_id) + ocr_info[idx]["pred"] = self.id2label_map_for_show[int(pred_id)] + results.append(ocr_info) + return results + + +class DistillationSerPostProcess(VQASerTokenLayoutLMPostProcess): + """ + DistillationSerPostProcess + """ + + def __init__(self, class_path, model_name=["Student"], key=None, **kwargs): + super().__init__(class_path, **kwargs) + if not isinstance(model_name, list): + model_name = [model_name] + self.model_name = model_name + self.key = key + + def __call__(self, preds, batch=None, *args, **kwargs): + output = dict() + for name in self.model_name: + pred = preds[name] + if self.key is not None: + pred = pred[self.key] + output[name] = super().__call__(pred, batch=batch, *args, **kwargs) + return output diff --git a/ppocr/utils/EN_symbol_dict.txt b/ppocr/utils/EN_symbol_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aef43d6b842731a54cbe682ccda5c2dbfa694d9 --- /dev/null +++ b/ppocr/utils/EN_symbol_dict.txt @@ -0,0 +1,94 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E 
+F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +{ +| +} +~ \ No newline at end of file diff --git a/ppocr/utils/__init__.py b/ppocr/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/ppocr/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppocr/utils/dict/ar_dict.txt b/ppocr/utils/dict/ar_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc6380293eb51754b3d82a4b20ce4bdc297d56ed --- /dev/null +++ b/ppocr/utils/dict/ar_dict.txt @@ -0,0 +1,117 @@ +a +r +b +i +c +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +l +6 +3 +9 +. +j +p +ا +ل +م +ر +ج +و +ح +ي +ة +5 +8 +7 +أ +ب +ض +4 +ك +س +ه +ث +ن +ط +ع +ت +غ +خ +ف +ئ +ز +إ +د +ص +ظ +ذ +ش +ى +ق +ؤ +آ +ء +s +e +n +w +t +u +z +d +A +N +G +h +o +E +T +H +O +B +y +F +U +J +X +W +P +Z +M +k +q +Y +Q +D +f +K +x +' +% +- +# +@ +! +& +$ +, +: +é +? ++ +É +( + diff --git a/ppocr/utils/dict/arabic_dict.txt b/ppocr/utils/dict/arabic_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..916d421c53bad563dfd980c1b64dcce07a3c9d24 --- /dev/null +++ b/ppocr/utils/dict/arabic_dict.txt @@ -0,0 +1,161 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ء +آ +أ +ؤ +إ +ئ +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ً +ٌ +ٍ +َ +ُ +ِ +ّ +ْ +ٓ +ٔ +ٰ +ٱ +ٹ +پ +چ +ڈ +ڑ +ژ +ک +ڭ +گ +ں +ھ +ۀ +ہ +ۂ +ۃ +ۆ +ۇ +ۈ +ۋ +ی +ې +ے +ۓ +ە +١ +٢ +٣ +٤ +٥ +٦ +٧ +٨ +٩ diff --git a/ppocr/utils/dict/be_dict.txt b/ppocr/utils/dict/be_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8458baaf2f1b1da82dc56bd259ca6fb3e887b89 --- /dev/null +++ b/ppocr/utils/dict/be_dict.txt @@ -0,0 +1,145 @@ +b +e +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +6 +9 +4 +3 +. +j +p +п +а +з +б +у +г +н +ц +ь +8 +м +л +і +о +ў +ы +7 +5 +М +х +с +р +ф +я +е +д +ж +ю +ч +й +к +Д +в +Б +т +І +ш +ё +э +К +Л +Н +А +Ж +Г +В +П +З +Е +О +Р +С +У +Ё +Й +Т +Ч +Э +Ц +Ю +Ш +Ф +Х +Я +Ь +Ы +Ў +s +c +n +w +M +o +t +T +E +A +B +u +h +y +k +r +H +d +Y +O +U +F +f +x +D +G +N +K +P +z +J +X +W +Z +Q +% +- +q +@ +' +! +# +& +, +: +$ +( +? +é ++ +É + diff --git a/ppocr/utils/dict/bg_dict.txt b/ppocr/utils/dict/bg_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..84713c373b5df1f59d51bd9c505721d8ec239b98 --- /dev/null +++ b/ppocr/utils/dict/bg_dict.txt @@ -0,0 +1,140 @@ +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ь +ю +я + diff --git a/ppocr/utils/dict/chinese_cht_dict.txt b/ppocr/utils/dict/chinese_cht_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc1aa4724b9a6f0e15275bcf61c91c26b6550c3e --- /dev/null +++ b/ppocr/utils/dict/chinese_cht_dict.txt @@ -0,0 +1,8421 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¥ +® +° +± +² +´ +· +» +É +Ë +Ó +× +Ü +à +á +ä +è +é +ì +í +ò +ó +÷ +ú +ü +ā +ē +ī +ō +ū +ǐ +ǒ +ɔ +ɡ +ʌ +ˋ +Λ +Ο +Φ +Ω +α +β +ε +θ +μ +π +З +И +Й +П +Я +г +— +‖ +‘ +’ +“ +” +• +… +‧ +′ +″ +※ +℃ +№ +™ +Ⅱ +Ⅲ +Ⅳ +← +↑ +→ +↓ +⇋ +∈ +∑ +√ +∞ +∣ +∧ +∩ +∫ +∶ +≈ +≠ +≤ +≥ +⊙ +⊥ +① +② +③ +④ +⑧ +⑴ +⑵ +⑶ +─ +│ +┅ +┌ +├ +█ +▎ +▏ +▕ +■ +□ +▪ +▲ +△ +▼ +◆ +◇ +○ +◎ +● +◥ +★ +☆ +❋ +❤ +  +、 +。 +〇 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〔 +〕 +〖 +〗 +の +サ +シ +ジ +マ +ㄱ +ㆍ +㎏ +㎡ +㐂 +㐱 +㙟 +㴪 +㸃 +䖝 +䝉 +䰾 +䲁 +一 +丁 +七 +丄 +丈 +三 +上 +下 +丌 +不 +与 +丏 +丐 +丑 +且 +丕 +世 +丘 +丙 +丞 +丟 +両 +並 +丨 +丫 +中 +丰 +串 +丶 +丸 +丹 +主 +丼 +丿 +乂 +乃 +久 +么 +之 +乍 +乎 +乏 +乒 +乓 +乖 +乗 +乘 +乙 +乚 +乜 +九 +乞 +也 +乩 +乭 +乳 +乸 +乹 +乾 +亀 +亂 +亅 +了 +予 +亊 +事 +二 +亍 +云 +互 +亓 +五 +井 +亘 +些 +亜 +亞 +亟 +亠 +亡 +亢 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +亰 +亳 +亶 +亹 +人 +亻 +什 +仁 +仂 +仃 +仄 +仇 +仉 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +仛 +仝 +仞 +仟 +仡 +代 +令 +以 +仨 +仫 +仮 +仰 +仲 +仳 +仵 +件 +仺 +任 +仼 +份 +仿 +企 +伃 +伈 +伉 +伊 +伋 +伍 +伎 +伏 +伐 +休 +伕 +伙 +伝 +伢 +伯 +估 +伱 +伴 +伶 +伷 +伸 +伺 +似 +伽 +伾 +佀 +佁 +佃 +但 +佇 +佈 +佉 +佋 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佣 +佤 +佧 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佹 +佺 +佼 +佾 +使 +侁 +侃 +侄 +侅 +來 +侈 +侊 +例 +侍 +侏 +侑 +侖 +侗 +侘 +侚 +供 +依 +侞 +価 +侮 +侯 +侵 +侶 +侷 +侹 +便 +俁 +係 +促 +俄 +俅 +俊 +俋 +俌 +俍 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +俛 +保 +俞 +俟 +俠 +信 +俬 +修 +俯 +俱 +俳 +俴 +俵 +俶 +俸 +俺 +俽 +俾 +倆 +倈 +倉 +個 +倌 +倍 +們 +倒 +倓 +倔 +倖 +倗 +倘 +候 +倚 +倜 +倞 +借 +倡 +倢 +倣 +値 +倦 +倧 +倩 +倪 +倫 +倬 +倭 +倮 +倻 +值 +偁 +偃 +假 +偈 +偉 +偊 +偌 +偍 +偎 +偏 +偓 +偕 +做 +停 +健 +偪 +偲 +側 +偵 +偶 +偷 +偸 +偽 +傀 +傃 +傅 +傈 +傉 +傍 +傑 +傒 +傕 +傖 +傘 +備 +傜 +傢 +傣 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +僉 +僊 +働 +像 +僑 +僔 +僕 +僖 +僙 +僚 +僜 +僡 +僧 +僩 +僭 +僮 +僰 +僱 +僳 +僴 +僵 +價 +僻 +儀 +儁 +儂 +億 +儆 +儇 +儈 +儉 +儋 +儐 +儒 +儔 +儕 +儘 +儚 +儞 +償 +儡 +儥 +儦 +優 +儫 +儱 +儲 +儷 +儺 +儻 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +児 +兒 +兔 +兕 +兗 +兜 +入 +內 +全 +兩 +兪 +八 +公 +六 +兮 +共 +兵 +其 +具 +典 +兼 +兿 +冀 +冂 +円 +冇 +冉 +冊 +再 +冏 +冑 +冒 +冕 +冖 +冗 +冚 +冠 +冢 +冤 +冥 +冧 +冨 +冪 +冫 +冬 +冮 +冰 +冴 +冶 +冷 +冼 +冽 +凃 +凄 +准 +凈 +凋 +凌 +凍 +凖 +凜 +凝 +凞 +几 +凡 +処 +凪 +凬 +凰 +凱 +凳 +凵 +凶 +凸 +凹 +出 +函 +刀 +刁 +刂 +刃 +刄 +分 +切 +刈 +刊 +刎 +刑 +划 +列 +初 +判 +別 +刦 +刧 +刨 +利 +刪 +刮 +到 +制 +刷 +券 +刺 +刻 +刼 +剁 +剃 +則 +削 +剋 +剌 +前 +剎 +剏 +剔 +剖 +剛 +剝 +剡 +剣 +剩 +剪 +剮 +副 +割 +創 +剿 +劃 +劄 +劇 +劈 +劉 +劊 +劌 +劍 +劑 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劬 +劭 +劵 +効 +劼 +劾 +勁 +勃 +勅 +勇 +勉 +勐 +勑 +勒 +勔 +動 +勖 +勗 +勘 +務 +勛 +勝 +勞 +募 +勢 +勣 +勤 +勦 +勰 +勱 +勲 +勳 +勵 +勷 +勸 +勺 +勻 +勾 +勿 +匂 +匄 +包 +匆 +匈 +匋 +匍 +匏 +匐 +匕 +化 +北 +匙 +匚 +匝 +匠 +匡 +匣 +匪 +匯 +匱 +匸 +匹 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卋 +卍 +卐 +卑 +卒 +卓 +協 +南 +博 +卜 +卞 +卟 +占 +卡 +卣 +卦 +卧 +卩 +卬 +卮 +卯 +印 +危 +卲 +即 +卵 +卷 +卸 +卹 +卺 +卻 +卽 +卿 +厄 +厓 +厔 +厙 +厚 +厝 +原 +厥 +厭 +厰 +厲 +厴 +厶 +去 +參 +叄 +又 +叉 +及 +友 +反 +収 +叔 +叕 +取 +受 +叛 +叟 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +司 +叻 +叼 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吔 +吖 +君 +吝 +吞 +吟 +吠 +吡 
+吥 +否 +吧 +吩 +含 +吮 +吱 +吲 +吳 +吵 +吶 +吸 +吹 +吻 +吼 +吾 +呀 +呂 +呃 +呈 +呉 +告 +呋 +呎 +呢 +呤 +呦 +周 +呱 +味 +呵 +呷 +呸 +呼 +命 +呾 +咀 +咁 +咂 +咄 +咅 +咆 +咋 +和 +咎 +咑 +咒 +咔 +咕 +咖 +咗 +咘 +咚 +咟 +咤 +咥 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咼 +咽 +咾 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哌 +哎 +哏 +哐 +哖 +哚 +哞 +員 +哥 +哦 +哨 +哩 +哪 +哭 +哮 +哱 +哲 +哺 +哼 +唃 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唘 +唧 +唫 +唬 +唭 +售 +唯 +唱 +唳 +唵 +唷 +唸 +唻 +唾 +啁 +啃 +啄 +商 +啉 +啊 +啍 +問 +啓 +啖 +啚 +啜 +啞 +啟 +啡 +啣 +啤 +啥 +啦 +啪 +啫 +啯 +啰 +啱 +啲 +啵 +啶 +啷 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喈 +喉 +喊 +喋 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喢 +喦 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +喹 +喻 +喼 +嗄 +嗅 +嗆 +嗇 +嗊 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗞 +嗡 +嗢 +嗣 +嗦 +嗨 +嗩 +嗪 +嗮 +嗯 +嗲 +嗶 +嗹 +嗽 +嘀 +嘅 +嘆 +嘉 +嘌 +嘍 +嘎 +嘏 +嘔 +嘗 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘥 +嘧 +嘩 +嘬 +嘮 +嘯 +嘰 +嘲 +嘴 +嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噍 +噏 +噓 +噗 +噝 +噠 +噢 +噤 +噥 +噦 +器 +噩 +噪 +噬 +噯 +噰 +噲 +噴 +噶 +噸 +噹 +噻 +嚇 +嚈 +嚎 +嚏 +嚐 +嚒 +嚓 +嚕 +嚗 +嚙 +嚞 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚴 +嚶 +嚷 +嚼 +嚿 +囀 +囂 +囃 +囉 +囊 +囍 +囑 +囒 +囓 +囗 +囚 +四 +囝 +回 +因 +囡 +団 +囤 +囧 +囪 +囮 +囯 +困 +囲 +図 +囶 +囷 +囹 +固 +囿 +圂 +圃 +圄 +圈 +圉 +國 +圍 +圏 +園 +圓 +圖 +圗 +團 +圜 +土 +圧 +在 +圩 +圪 +圭 +圯 +地 +圳 +圻 +圾 +址 +均 +坊 +坋 +坌 +坍 +坎 +坐 +坑 +坖 +坡 +坣 +坤 +坦 +坨 +坩 +坪 +坫 +坬 +坭 +坮 +坯 +坳 +坵 +坶 +坷 +坻 +垂 +垃 +垈 +型 +垍 +垓 +垕 +垚 +垛 +垞 +垟 +垠 +垢 +垣 +垮 +垯 +垰 +垵 +垸 +垻 +垿 +埃 +埅 +埇 +埈 +埋 +埌 +城 +埏 +埒 +埔 +埕 +埗 +埜 +域 +埠 +埡 +埤 +埧 +埨 +埪 +埭 +埮 +埴 +埵 +執 +培 +基 +埻 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堈 +堉 +堊 +堍 +堖 +堝 +堡 +堤 +堦 +堪 +堮 +堯 +堰 +報 +場 +堵 +堷 +堺 +塀 +塅 +塆 +塊 +塋 +塌 +塍 +塏 +塑 +塔 +塗 +塘 +塙 +塜 +塞 +塡 +塢 +塤 +塨 +塩 +填 +塬 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +塾 +墀 +境 +墅 +墉 +墊 +墎 +墓 +増 +墘 +墜 +增 +墟 +墡 +墣 +墨 +墩 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壁 +壄 +壆 +壇 +壋 +壌 +壎 +壐 +壑 +壓 +壔 +壕 +壘 +壙 +壞 +壟 +壠 +壢 +壤 +壩 +士 +壬 +壯 +壱 +壴 +壹 +壺 +壽 +夀 +夆 +変 +夊 +夋 +夌 +夏 +夔 +夕 +外 +夙 +多 +夜 +夠 +夢 +夤 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +夯 +失 +夷 +夾 +奀 +奄 +奇 +奈 +奉 +奎 +奏 +奐 +契 +奓 +奔 +奕 +套 +奘 +奚 +奠 +奢 +奣 +奧 +奩 +奪 +奫 +奭 +奮 +女 +奴 +奶 +她 +好 +妀 +妁 +如 +妃 +妄 +妊 +妍 +妏 +妑 +妒 +妓 +妖 +妙 +妝 +妞 +妠 +妤 +妥 +妧 +妨 +妭 +妮 +妯 +妲 +妳 +妸 +妹 +妺 +妻 +妾 +姀 +姁 +姃 +姆 +姈 +姉 +姊 +始 +姌 +姍 +姐 +姑 +姒 +姓 +委 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姮 +姵 +姶 +姸 +姻 +姿 +威 +娃 +娉 +娋 +娌 +娍 +娎 +娑 +娖 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娫 +娳 +娶 +娸 +娼 +娽 +婀 +婁 +婆 +婉 +婊 +婑 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婯 +婷 +婺 +婻 +婼 +婿 +媃 +媄 +媊 +媐 +媒 +媓 +媖 +媗 +媚 +媛 +媜 +媞 +媧 +媭 +媯 +媲 +媳 +媺 +媼 +媽 +媾 +媿 +嫁 +嫂 +嫄 +嫈 +嫉 +嫌 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬉 +嬋 +嬌 +嬗 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬰 +嬴 +嬸 +嬾 +嬿 +孀 +孃 +孆 +孋 +孌 +子 +孑 +孔 +孕 +孖 +字 +存 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +孩 +孫 +孬 +孮 +孰 +孳 +孵 +學 +孺 +孻 +孽 +孿 +宀 +它 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +実 +客 +宣 +室 +宥 +宦 +宧 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寀 +寁 +寂 +寄 +寅 +密 +寇 +寈 +寊 +富 +寐 +寒 +寓 +寔 +寕 +寖 +寗 +寘 +寛 +寜 +寞 +察 +寡 +寢 +寤 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寯 +寰 +寳 +寵 +寶 +寸 +寺 +対 +封 +専 +尃 +射 +將 +專 +尉 +尊 +尋 +對 +導 +小 +尐 +少 +尓 +尕 +尖 +尗 +尙 +尚 +尢 +尤 +尨 +尪 +尬 +就 +尷 +尹 +尺 +尻 +尼 +尾 +尿 +局 +屁 +屄 +居 +屆 +屇 +屈 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +屓 +展 +屚 +屜 +屠 +屢 +層 +履 +屬 +屭 +屯 +山 +屹 +屺 +屻 +岀 +岈 +岌 +岐 +岑 +岔 +岡 +岢 +岣 +岧 +岩 +岪 +岫 +岬 +岰 +岱 +岳 +岵 +岷 +岸 +岻 +峁 +峅 +峇 +峋 +峍 +峒 +峘 +峙 +峚 +峠 +峨 +峩 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峼 +峽 +崁 +崆 +崇 +崈 +崋 +崍 +崎 +崐 +崑 +崒 +崔 +崖 +崗 +崘 +崙 +崚 +崛 +崞 +崟 +崠 +崢 +崤 +崧 +崩 +崬 +崮 +崱 +崴 +崵 +崶 +崽 +嵇 +嵊 +嵋 +嵌 +嵎 +嵐 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵩 +嵬 +嵮 +嵯 +嵰 +嵴 +嵻 +嵿 +嶁 +嶂 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶪 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶺 +嶼 +嶽 +巂 +巄 +巆 +巋 +巌 +巍 +巎 +巑 +巒 +巔 +巖 +巘 +巛 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巫 +差 +巰 +己 +已 +巳 +巴 +巶 +巷 +巻 +巽 +巾 +巿 +市 +布 +帆 +希 +帑 +帔 +帕 +帖 +帘 +帙 +帚 +帛 +帝 +帡 +帢 +帥 +師 +席 +帯 +帰 +帳 +帶 +帷 +常 +帽 +幀 +幃 +幄 +幅 +幌 +幔 +幕 +幗 +幚 +幛 +幟 +幡 +幢 +幣 +幪 +幫 +干 +平 +年 +幵 +幷 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +庀 +庁 +広 +庇 +床 +序 +底 +庖 +店 +庚 +府 +庠 +庢 +庥 +度 +座 +庫 +庭 +庲 +庵 +庶 +康 +庸 +庹 +庼 +庾 +廁 +廂 +廄 +廆 +廈 +廉 +廊 +廋 +廌 +廍 +廑 +廓 +廔 +廕 +廖 +廙 +廚 +廝 +廞 +廟 +廠 +廡 +廢 +廣 +廧 +廨 +廩 +廬 +廰 +廱 +廳 
+延 +廷 +廸 +建 +廻 +廼 +廿 +弁 +弄 +弅 +弇 +弈 +弉 +弊 +弋 +弍 +式 +弐 +弒 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弢 +弦 +弧 +弨 +弩 +弭 +弱 +張 +強 +弸 +弼 +弾 +彀 +彄 +彅 +彆 +彈 +彊 +彌 +彎 +彐 +彔 +彖 +彗 +彘 +彙 +彜 +彞 +彠 +彡 +形 +彣 +彤 +彥 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彳 +彷 +役 +彼 +彿 +往 +征 +徂 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +得 +徘 +徙 +徜 +從 +徠 +御 +徧 +徨 +復 +循 +徫 +徬 +徭 +微 +徳 +徴 +徵 +德 +徸 +徹 +徽 +心 +忄 +必 +忉 +忌 +忍 +忐 +忑 +忒 +志 +忘 +忙 +応 +忝 +忞 +忠 +快 +忬 +忯 +忱 +忳 +念 +忻 +忽 +忿 +怍 +怎 +怒 +怕 +怖 +怙 +怛 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +恁 +恂 +恃 +恆 +恊 +恍 +恐 +恕 +恙 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恬 +恭 +息 +恰 +恵 +恿 +悄 +悅 +悆 +悉 +悌 +悍 +悔 +悖 +悚 +悛 +悝 +悞 +悟 +悠 +患 +悧 +您 +悪 +悰 +悲 +悳 +悵 +悶 +悸 +悼 +情 +惆 +惇 +惑 +惔 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惣 +惦 +惰 +惱 +惲 +想 +惶 +惹 +惺 +愁 +愃 +愆 +愈 +愉 +愍 +意 +愐 +愒 +愔 +愕 +愚 +愛 +愜 +感 +愣 +愧 +愨 +愫 +愭 +愴 +愷 +愼 +愾 +愿 +慄 +慈 +態 +慌 +慎 +慕 +慘 +慚 +慜 +慟 +慢 +慣 +慥 +慧 +慨 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憍 +憎 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憺 +憻 +憾 +懂 +懃 +懇 +懈 +應 +懋 +懌 +懍 +懐 +懣 +懦 +懮 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懽 +懾 +懿 +戀 +戇 +戈 +戊 +戌 +戍 +戎 +成 +我 +戒 +戔 +戕 +或 +戙 +戚 +戛 +戟 +戡 +戢 +戥 +戦 +戩 +截 +戮 +戰 +戱 +戲 +戳 +戴 +戶 +戸 +戻 +戽 +戾 +房 +所 +扁 +扆 +扇 +扈 +扉 +手 +扌 +才 +扎 +扒 +打 +扔 +托 +扙 +扛 +扞 +扣 +扥 +扦 +扭 +扮 +扯 +扳 +扶 +批 +扼 +找 +承 +技 +抃 +抄 +抇 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抦 +披 +抬 +抱 +抵 +抹 +抻 +押 +抽 +抿 +拂 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拏 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拝 +拡 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拺 +拼 +拽 +拾 +拿 +持 +指 +按 +挎 +挑 +挖 +挙 +挨 +挪 +挫 +振 +挲 +挵 +挹 +挺 +挻 +挾 +捂 +捆 +捉 +捌 +捍 +捎 +捏 +捐 +捒 +捕 +捜 +捦 +捧 +捨 +捩 +捫 +捭 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掄 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掞 +掟 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掾 +揀 +揄 +揆 +揉 +揍 +描 +提 +插 +揔 +揖 +揚 +換 +握 +揪 +揭 +揮 +援 +揸 +揺 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搠 +搢 +搪 +搬 +搭 +搳 +搴 +搵 +搶 +搽 +搾 +摂 +摒 +摔 +摘 +摜 +摞 +摟 +摠 +摧 +摩 +摭 +摯 +摳 +摴 +摵 +摶 +摸 +摹 +摺 +摻 +摽 +撃 +撇 +撈 +撐 +撒 +撓 +撕 +撖 +撙 +撚 +撞 +撣 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撳 +撻 +撼 +撾 +撿 +擀 +擁 +擂 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擠 +擢 +擥 +擦 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攔 +攖 +攘 +攜 +攝 +攞 +攢 +攣 +攤 +攪 +攫 +攬 +支 +攴 +攵 +收 +攷 +攸 +改 +攻 +攽 +放 +政 +故 +效 +敍 +敎 +敏 +救 +敔 +敕 +敖 +敗 +敘 +教 +敝 +敞 +敟 +敢 +散 +敦 +敫 +敬 +敭 +敲 +整 +敵 +敷 +數 +敻 +敾 +斂 +斃 +文 +斌 +斎 +斐 +斑 +斕 +斖 +斗 +料 +斛 +斜 +斝 +斟 +斡 +斤 +斥 +斧 +斬 +斯 +新 +斷 +方 +於 +施 +斿 +旁 +旂 +旃 +旄 +旅 +旉 +旋 +旌 +旎 +族 +旖 +旗 +旙 +旛 +旡 +既 +日 +旦 +旨 +早 +旬 +旭 +旱 +旲 +旳 +旺 +旻 +旼 +旽 +旾 +旿 +昀 +昂 +昃 +昆 +昇 +昉 +昊 +昌 +昍 +明 +昏 +昐 +易 +昔 +昕 +昚 +昛 +昜 +昝 +昞 +星 +映 +昡 +昣 +昤 +春 +昧 +昨 +昪 +昫 +昭 +是 +昰 +昱 +昴 +昵 +昶 +昺 +晁 +時 +晃 +晈 +晉 +晊 +晏 +晗 +晙 +晚 +晛 +晝 +晞 +晟 +晤 +晦 +晧 +晨 +晩 +晪 +晫 +晭 +普 +景 +晰 +晳 +晴 +晶 +晷 +晸 +智 +晾 +暃 +暄 +暅 +暇 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暑 +暕 +暖 +暗 +暘 +暝 +暟 +暠 +暢 +暦 +暨 +暫 +暮 +暱 +暲 +暴 +暸 +暹 +暻 +暾 +曄 +曅 +曆 +曇 +曉 +曌 +曔 +曖 +曙 +曜 +曝 +曠 +曦 +曧 +曨 +曩 +曬 +曮 +曰 +曲 +曳 +更 +曶 +曷 +書 +曹 +曺 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朊 +朋 +服 +朏 +朐 +朓 +朔 +朕 +朖 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朱 +朴 +朵 +朶 +朽 +朿 +杁 +杉 +杋 +杌 +李 +杏 +材 +村 +杓 +杖 +杙 +杜 +杞 +束 +杠 +杣 +杤 +杧 +杬 +杭 +杯 +東 +杲 +杳 +杴 +杵 +杷 +杻 +杼 +松 +板 +极 +枇 +枉 +枋 +枏 +析 +枕 +枖 +林 +枚 +枛 +果 +枝 +枠 +枡 +枯 +枰 +枱 +枲 +枳 +架 +枷 +枸 +枹 +枼 +柁 +柃 +柄 +柉 +柊 +柎 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柝 +柞 +柟 +查 +柩 +柬 +柯 +柰 +柱 +柳 +柴 +柵 +柶 +柷 +査 +柾 +柿 +栃 +栄 +栐 +栒 +栓 +栜 +栝 +栞 +校 +栢 +栨 +栩 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桀 +桁 +桂 +桃 +桄 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桕 +桖 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桶 +桷 +桼 +桿 +梀 +梁 +梂 +梃 +梅 +梆 +梉 +梏 +梓 +梔 +梗 +梘 +條 +梟 +梠 +梢 +梣 +梧 +梨 +梫 +梭 +梯 +械 +梱 +梳 +梵 +梶 +梽 +棄 +棆 +棉 +棋 +棍 +棐 +棒 +棓 +棕 +棖 +棗 +棘 +棚 +棛 +棟 +棠 +棡 +棣 +棧 +棨 +棩 +棪 +棫 +森 +棱 +棲 +棵 +棶 +棹 +棺 +棻 +棼 +棽 +椅 +椆 +椇 +椋 +植 +椎 +椏 +椒 +椙 +椥 +椪 +椰 +椲 +椴 +椵 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楗 +楙 +楚 +楝 +楞 +楠 +楡 +楢 +楣 +楤 +楦 +楧 +楨 +楫 +業 +楮 +楯 +楳 +極 +楷 +楸 +楹 +楽 +楿 +概 +榆 +榊 +榍 +榎 +榑 +榔 +榕 +榖 +榗 +榘 +榛 +榜 +榞 +榢 +榣 +榤 +榦 +榧 +榨 +榫 +榭 +榮 +榲 +榴 +榷 +榻 +榿 +槀 +槁 +槃 +槊 +構 +槌 +槍 +槎 +槐 +槓 +槔 +槗 +様 +槙 +槤 +槩 +槭 +槰 +槱 +槲 +槳 +槺 +槻 +槼 +槽 +槿 
+樀 +樁 +樂 +樅 +樆 +樊 +樋 +樑 +樓 +樗 +樘 +標 +樞 +樟 +模 +樣 +樨 +権 +樫 +樵 +樸 +樹 +樺 +樻 +樽 +樾 +橄 +橇 +橈 +橋 +橐 +橒 +橓 +橘 +橙 +橚 +機 +橡 +橢 +橪 +橫 +橿 +檀 +檄 +檇 +檉 +檊 +檎 +檐 +檔 +檗 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檫 +檬 +檯 +檳 +檵 +檸 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫥 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +權 +欏 +欒 +欖 +欞 +欠 +次 +欣 +欥 +欲 +欸 +欹 +欺 +欽 +款 +歆 +歇 +歉 +歊 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歿 +殂 +殃 +殄 +殆 +殉 +殊 +殑 +殖 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +段 +殷 +殺 +殻 +殼 +殿 +毀 +毅 +毆 +毉 +毋 +毌 +母 +毎 +每 +毐 +毒 +毓 +比 +毖 +毗 +毘 +毛 +毫 +毬 +毯 +毴 +毸 +毽 +毿 +氂 +氈 +氍 +氏 +氐 +民 +氓 +氖 +気 +氘 +氙 +氚 +氛 +氟 +氣 +氦 +氧 +氨 +氪 +氫 +氬 +氮 +氯 +氰 +水 +氵 +氷 +永 +氹 +氻 +氽 +氾 +汀 +汁 +求 +汊 +汎 +汐 +汕 +汗 +汛 +汜 +汝 +汞 +江 +池 +污 +汧 +汨 +汩 +汪 +汭 +汰 +汲 +汴 +汶 +決 +汽 +汾 +沁 +沂 +沃 +沄 +沅 +沆 +沇 +沈 +沉 +沌 +沍 +沏 +沐 +沒 +沓 +沔 +沖 +沘 +沙 +沚 +沛 +沜 +沢 +沨 +沫 +沭 +沮 +沯 +沱 +河 +沸 +油 +沺 +治 +沼 +沽 +沾 +沿 +況 +泂 +泄 +泆 +泇 +泉 +泊 +泌 +泐 +泓 +泔 +法 +泖 +泗 +泚 +泛 +泠 +泡 +波 +泣 +泥 +泩 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +洄 +洋 +洌 +洎 +洗 +洙 +洛 +洞 +洢 +洣 +洤 +津 +洨 +洩 +洪 +洮 +洱 +洲 +洳 +洵 +洸 +洹 +洺 +活 +洽 +派 +流 +浄 +浙 +浚 +浛 +浜 +浞 +浟 +浠 +浡 +浣 +浤 +浥 +浦 +浩 +浪 +浮 +浯 +浴 +浵 +海 +浸 +浹 +涅 +涇 +消 +涉 +涌 +涎 +涑 +涓 +涔 +涕 +涙 +涪 +涫 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淄 +淅 +淆 +淇 +淋 +淌 +淍 +淎 +淏 +淑 +淓 +淖 +淘 +淙 +淚 +淛 +淝 +淞 +淠 +淡 +淤 +淥 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +淯 +淰 +深 +淳 +淵 +淶 +混 +淸 +淹 +淺 +添 +淼 +淽 +渃 +清 +済 +渉 +渋 +渕 +渙 +渚 +減 +渝 +渟 +渠 +渡 +渣 +渤 +渥 +渦 +渫 +測 +渭 +港 +渲 +渴 +游 +渺 +渼 +渽 +渾 +湃 +湄 +湉 +湊 +湍 +湓 +湔 +湖 +湘 +湛 +湜 +湞 +湟 +湣 +湥 +湧 +湫 +湮 +湯 +湳 +湴 +湼 +満 +溁 +溇 +溈 +溉 +溋 +溎 +溏 +源 +準 +溙 +溜 +溝 +溟 +溢 +溥 +溦 +溧 +溪 +溫 +溯 +溱 +溲 +溴 +溵 +溶 +溺 +溼 +滀 +滁 +滂 +滄 +滅 +滇 +滈 +滉 +滋 +滌 +滎 +滏 +滑 +滓 +滔 +滕 +滘 +滙 +滝 +滬 +滯 +滲 +滴 +滷 +滸 +滹 +滻 +滽 +滾 +滿 +漁 +漂 +漆 +漇 +漈 +漎 +漏 +漓 +演 +漕 +漚 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漴 +漵 +漷 +漸 +漼 +漾 +漿 +潁 +潑 +潔 +潘 +潛 +潞 +潟 +潢 +潤 +潭 +潮 +潯 +潰 +潲 +潺 +潼 +潽 +潾 +潿 +澀 +澁 +澂 +澄 +澆 +澇 +澈 +澉 +澋 +澌 +澍 +澎 +澔 +澗 +澠 +澡 +澣 +澤 +澥 +澧 +澪 +澮 +澯 +澱 +澳 +澶 +澹 +澻 +激 +濁 +濂 +濃 +濉 +濊 +濋 +濕 +濘 +濙 +濛 +濞 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濰 +濱 +濲 +濶 +濺 +濼 +濾 +瀁 +瀅 +瀆 +瀉 +瀍 +瀏 +瀑 +瀔 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀠 +瀣 +瀦 +瀧 +瀨 +瀬 +瀰 +瀲 +瀴 +瀶 +瀹 +瀾 +灃 +灊 +灌 +灑 +灘 +灝 +灞 +灡 +灣 +灤 +灧 +火 +灰 +灴 +灸 +灼 +災 +炁 +炅 +炆 +炊 +炎 +炒 +炔 +炕 +炘 +炙 +炟 +炣 +炤 +炫 +炬 +炭 +炮 +炯 +炱 +炲 +炳 +炷 +炸 +為 +炻 +烈 +烉 +烊 +烋 +烏 +烒 +烔 +烘 +烙 +烜 +烝 +烤 +烯 +烱 +烴 +烷 +烹 +烺 +烽 +焃 +焄 +焉 +焊 +焌 +焓 +焗 +焙 +焚 +焜 +焞 +無 +焦 +焯 +焰 +焱 +焴 +然 +焻 +焼 +焿 +煇 +煉 +煊 +煌 +煎 +煐 +煒 +煔 +煕 +煖 +煙 +煚 +煜 +煞 +煠 +煤 +煥 +煦 +照 +煨 +煩 +煬 +煮 +煲 +煳 +煵 +煶 +煸 +煽 +熄 +熅 +熇 +熈 +熊 +熏 +熒 +熔 +熖 +熗 +熘 +熙 +熜 +熟 +熠 +熤 +熥 +熨 +熬 +熯 +熱 +熲 +熳 +熵 +熹 +熺 +熼 +熾 +熿 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燏 +燐 +燒 +燔 +燕 +燘 +燙 +燚 +燜 +燝 +營 +燥 +燦 +燧 +燫 +燬 +燭 +燮 +燴 +燹 +燻 +燼 +燾 +燿 +爀 +爆 +爌 +爍 +爐 +爔 +爚 +爛 +爝 +爨 +爪 +爬 +爭 +爯 +爰 +爲 +爵 +父 +爸 +爹 +爺 +爻 +爽 +爾 +爿 +牁 +牂 +牆 +片 +版 +牌 +牒 +牕 +牖 +牘 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牧 +物 +牯 +牲 +特 +牻 +牼 +牽 +犀 +犁 +犂 +犇 +犍 +犎 +犖 +犛 +犢 +犧 +犨 +犬 +犯 +犰 +犴 +犽 +狀 +狂 +狄 +狍 +狎 +狐 +狒 +狓 +狗 +狙 +狛 +狟 +狠 +狡 +狦 +狨 +狩 +狳 +狶 +狷 +狸 +狹 +狻 +狼 +猁 +猄 +猇 +猊 +猗 +猙 +猛 +猜 +猝 +猞 +猢 +猥 +猨 +猩 +猳 +猴 +猶 +猷 +猺 +猻 +猾 +猿 +獁 +獃 +獄 +獅 +獇 +獎 +獏 +獐 +獒 +獠 +獢 +獣 +獨 +獬 +獮 +獯 +獰 +獲 +獴 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玀 +玄 +玆 +率 +玉 +王 +玎 +玏 +玓 +玕 +玖 +玗 +玘 +玙 +玟 +玠 +玡 +玢 +玥 +玧 +玨 +玩 +玫 +玭 +玲 +玳 +玶 +玷 +玹 +玻 +玾 +珀 +珂 +珅 +珈 +珉 +珊 +珌 +珍 +珎 +珏 +珖 +珙 +珝 +珞 +珠 +珡 +珣 +珤 +珥 +珦 +珧 +珩 +珪 +班 +珮 +珵 +珹 +珺 +珽 +現 +琁 +球 +琄 +琅 +理 +琇 +琉 +琊 +琍 +琎 +琚 +琛 +琡 +琢 +琤 +琥 +琦 +琨 +琪 +琬 +琮 +琯 +琰 +琱 +琳 +琴 +琵 +琶 +琹 +琺 +琿 +瑀 +瑁 +瑂 +瑄 +瑅 +瑆 +瑈 +瑊 +瑋 +瑑 +瑒 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑝 +瑞 +瑟 +瑠 +瑢 +瑣 +瑤 +瑥 +瑧 +瑨 +瑩 +瑪 +瑭 +瑯 +瑰 +瑱 +瑳 +瑴 +瑺 +瑾 +璀 +璁 +璃 +璄 +璆 +璇 +璈 +璉 +璋 +璌 +璐 +璕 +璘 +璙 +璚 +璜 +璞 +璟 +璠 +璡 +璣 +璥 +璦 +璧 +璨 +璩 +璪 +璫 +璬 +璮 +環 +璱 +璵 +璸 +璹 +璽 +璿 +瓈 +瓊 +瓌 +瓏 +瓑 +瓔 +瓖 +瓘 +瓚 +瓛 +瓜 +瓞 +瓠 +瓢 +瓣 +瓤 +瓦 +瓮 +瓴 +瓶 +瓷 +瓿 +甂 +甄 +甌 +甍 +甑 +甕 +甘 +甙 +甚 +甜 +生 +甡 +產 +産 +甥 +甦 +用 +甩 +甪 +甫 +甬 +甯 +田 +由 +甲 +申 +男 +甸 +甹 +町 +甾 +畀 +畇 +畈 +畊 +畋 +界 
+畎 +畏 +畐 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +畦 +畧 +番 +畫 +畬 +畯 +異 +畲 +畳 +畵 +當 +畷 +畸 +畹 +畿 +疃 +疆 +疇 +疊 +疋 +疌 +疍 +疏 +疑 +疒 +疕 +疙 +疚 +疝 +疣 +疤 +疥 +疫 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痊 +痍 +痔 +痕 +痘 +痙 +痛 +痞 +痟 +痠 +痢 +痣 +痤 +痧 +痩 +痰 +痱 +痲 +痴 +痹 +痺 +痿 +瘀 +瘁 +瘊 +瘋 +瘍 +瘓 +瘙 +瘜 +瘞 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘰 +瘴 +瘺 +癀 +療 +癆 +癇 +癌 +癒 +癖 +癘 +癜 +癟 +癡 +癢 +癤 +癥 +癩 +癬 +癭 +癮 +癯 +癰 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皐 +皓 +皖 +皙 +皚 +皛 +皝 +皞 +皮 +皰 +皴 +皷 +皸 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盋 +盌 +盎 +盒 +盔 +盛 +盜 +盞 +盟 +盡 +監 +盤 +盥 +盦 +盧 +盨 +盩 +盪 +盫 +目 +盯 +盱 +盲 +直 +盷 +相 +盹 +盺 +盼 +盾 +眀 +省 +眉 +看 +県 +眙 +眛 +眜 +眞 +真 +眠 +眥 +眨 +眩 +眭 +眯 +眵 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睇 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睭 +睹 +睺 +睽 +睾 +睿 +瞄 +瞅 +瞋 +瞌 +瞎 +瞑 +瞓 +瞞 +瞢 +瞥 +瞧 +瞪 +瞫 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞽 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矞 +矢 +矣 +知 +矧 +矩 +短 +矮 +矯 +石 +矸 +矽 +砂 +砋 +砌 +砍 +砒 +研 +砝 +砢 +砥 +砦 +砧 +砩 +砫 +砭 +砮 +砯 +砰 +砲 +砳 +破 +砵 +砷 +砸 +砼 +硂 +硃 +硅 +硇 +硏 +硐 +硒 +硓 +硚 +硜 +硝 +硤 +硨 +硫 +硬 +硭 +硯 +硼 +碁 +碇 +碉 +碌 +碎 +碑 +碓 +碕 +碗 +碘 +碚 +碟 +碡 +碣 +碧 +碩 +碪 +碭 +碰 +碲 +碳 +碴 +碶 +碸 +確 +碻 +碼 +碽 +碾 +磁 +磅 +磊 +磋 +磐 +磔 +磕 +磘 +磙 +磚 +磜 +磡 +磨 +磪 +磬 +磯 +磱 +磲 +磵 +磷 +磺 +磻 +磾 +礁 +礄 +礎 +礐 +礑 +礒 +礙 +礠 +礦 +礪 +礫 +礬 +礮 +礱 +礴 +示 +礻 +礽 +社 +祀 +祁 +祂 +祆 +祇 +祈 +祉 +祋 +祏 +祐 +祓 +祕 +祖 +祗 +祙 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祥 +祧 +票 +祭 +祹 +祺 +祼 +祿 +禁 +禃 +禇 +禍 +禎 +福 +禑 +禓 +禔 +禕 +禘 +禛 +禟 +禠 +禤 +禦 +禧 +禨 +禩 +禪 +禮 +禰 +禱 +禵 +禹 +禺 +禼 +禽 +禾 +禿 +秀 +私 +秈 +秉 +秋 +科 +秒 +秕 +秘 +租 +秠 +秣 +秤 +秦 +秧 +秩 +秭 +秳 +秸 +移 +稀 +稅 +稈 +稉 +程 +稍 +稑 +稔 +稗 +稘 +稙 +稚 +稜 +稞 +稟 +稠 +種 +稱 +稲 +稷 +稹 +稺 +稻 +稼 +稽 +稾 +稿 +穀 +穂 +穆 +穈 +穉 +穌 +積 +穎 +穗 +穟 +穠 +穡 +穢 +穣 +穩 +穫 +穰 +穴 +穵 +究 +穹 +空 +穿 +突 +窄 +窅 +窈 +窋 +窒 +窕 +窖 +窗 +窘 +窟 +窠 +窣 +窨 +窩 +窪 +窮 +窯 +窰 +窶 +窺 +窿 +竄 +竅 +竇 +竈 +竊 +立 +竑 +站 +竜 +竟 +章 +竣 +童 +竦 +竩 +竭 +端 +競 +竹 +竺 +竻 +竿 +笄 +笆 +笈 +笏 +笑 +笘 +笙 +笛 +笞 +笠 +笥 +符 +笨 +笩 +笪 +第 +笭 +笮 +笯 +笱 +笳 +笹 +筅 +筆 +等 +筊 +筋 +筌 +筍 +筏 +筐 +筒 +答 +策 +筘 +筠 +筥 +筦 +筧 +筬 +筭 +筱 +筲 +筳 +筵 +筶 +筷 +筻 +箆 +箇 +箋 +箍 +箏 +箐 +箑 +箒 +箔 +箕 +算 +箜 +管 +箬 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篊 +篋 +篌 +篔 +篙 +篝 +篠 +篡 +篤 +篥 +篦 +篩 +篪 +篭 +篯 +篳 +篷 +簀 +簃 +簇 +簉 +簋 +簍 +簑 +簕 +簗 +簞 +簠 +簡 +簧 +簪 +簫 +簷 +簸 +簹 +簺 +簽 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籙 +籛 +籜 +籝 +籟 +籠 +籣 +籤 +籥 +籪 +籬 +籮 +籲 +米 +籽 +籾 +粄 +粉 +粍 +粑 +粒 +粕 +粗 +粘 +粟 +粢 +粥 +粦 +粧 +粩 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糀 +糅 +糊 +糌 +糍 +糎 +糕 +糖 +糙 +糜 +糝 +糞 +糟 +糠 +糢 +糧 +糬 +糯 +糰 +糴 +糶 +糸 +糹 +糺 +系 +糾 +紀 +紂 +約 +紅 +紆 +紇 +紈 +紉 +紊 +紋 +納 +紐 +紑 +紓 +純 +紕 +紗 +紘 +紙 +級 +紛 +紜 +紝 +紞 +素 +紡 +索 +紫 +紮 +累 +細 +紱 +紲 +紳 +紵 +紹 +紺 +紿 +終 +絃 +組 +絆 +経 +絎 +結 +絕 +絛 +絜 +絞 +絡 +絢 +給 +絨 +絪 +絮 +統 +絲 +絳 +絵 +絶 +絹 +絺 +綁 +綃 +綈 +綉 +綎 +綏 +經 +綖 +継 +続 +綜 +綝 +綞 +綠 +綢 +綣 +綦 +綧 +綫 +綬 +維 +綮 +綰 +綱 +網 +綳 +綴 +綸 +綺 +綻 +綽 +綾 +綿 +緁 +緃 +緄 +緈 +緊 +緋 +総 +緑 +緒 +緖 +緘 +線 +緜 +緝 +緞 +締 +緡 +緣 +緤 +編 +緩 +緬 +緯 +緱 +緲 +練 +緹 +緻 +縂 +縄 +縈 +縉 +縊 +縕 +縛 +縝 +縞 +縠 +縡 +縣 +縤 +縫 +縮 +縯 +縱 +縴 +縵 +縷 +縹 +縻 +總 +績 +繁 +繃 +繆 +繇 +繒 +織 +繕 +繖 +繙 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繰 +繳 +繹 +繻 +繼 +繽 +繾 +纁 +纂 +纈 +續 +纍 +纏 +纓 +纔 +纕 +纖 +纘 +纛 +纜 +缐 +缶 +缸 +缺 +缽 +罃 +罄 +罅 +罈 +罉 +罌 +罍 +罐 +罔 +罕 +罘 +罟 +罡 +罨 +罩 +罪 +置 +罰 +罱 +署 +罳 +罵 +罶 +罷 +罹 +罽 +羂 +羅 +羆 +羈 +羊 +羋 +羌 +美 +羔 +羕 +羗 +羙 +羚 +羞 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羰 +羱 +羲 +羸 +羹 +羽 +羿 +翀 +翁 +翂 +翃 +翅 +翊 +翌 +翎 +翏 +習 +翔 +翕 +翙 +翜 +翟 +翠 +翡 +翥 +翦 +翩 +翬 +翮 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +而 +耍 +耎 +耐 +耑 +耒 +耔 +耕 +耗 +耘 +耙 +耜 +耦 +耨 +耬 +耳 +耵 +耶 +耷 +耽 +耿 +聃 +聆 +聊 +聒 +聖 +聘 +聚 +聞 +聟 +聨 +聯 +聰 +聱 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肖 +肘 +肚 +肛 +肜 +肝 +肟 +股 +肢 +肥 +肩 +肪 +肫 +肯 +肱 +育 +肸 +肹 +肺 +肼 +肽 +胂 +胃 +胄 +胅 +胇 +胊 +背 +胍 +胎 +胖 +胗 +胙 +胚 +胛 +胝 +胞 +胡 +胤 +胥 +胬 +胭 +胰 +胱 +胳 +胴 +胸 +胺 +胼 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脒 +脖 +脘 +脛 +脣 +脩 +脫 +脬 +脭 +脯 +脲 +脳 +脷 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腧 +腩 +腫 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腿 +膀 +膂 +膈 +膊 +膏 +膚 +膛 +膜 +膝 +膠 +膣 +膥 +膦 +膨 +膩 +膮 +膳 +膺 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臏 +臘 
+臚 +臞 +臟 +臠 +臣 +臧 +臨 +自 +臭 +臯 +至 +致 +臺 +臻 +臼 +臾 +舂 +舅 +與 +興 +舉 +舊 +舌 +舍 +舎 +舒 +舔 +舖 +舘 +舛 +舜 +舞 +舟 +舢 +舥 +舨 +舩 +航 +舫 +般 +舲 +舵 +舶 +舷 +舸 +船 +舺 +艅 +艇 +艉 +艋 +艎 +艏 +艔 +艘 +艙 +艚 +艦 +艮 +良 +艱 +色 +艶 +艷 +艸 +艽 +艾 +艿 +芃 +芊 +芋 +芍 +芎 +芑 +芒 +芘 +芙 +芛 +芝 +芡 +芥 +芨 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芴 +芷 +芸 +芹 +芻 +芽 +芾 +苄 +苅 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苝 +苞 +苟 +苡 +苣 +苤 +若 +苦 +苧 +苪 +苫 +苯 +英 +苳 +苴 +苷 +苺 +苻 +苼 +苾 +茀 +茁 +茂 +范 +茄 +茅 +茆 +茇 +茈 +茉 +茌 +茗 +茘 +茚 +茛 +茜 +茝 +茨 +茫 +茬 +茭 +茮 +茯 +茱 +茲 +茴 +茵 +茶 +茷 +茸 +茹 +茺 +茼 +荀 +荃 +荅 +荇 +草 +荊 +荎 +荏 +荒 +荔 +荖 +荘 +荳 +荷 +荸 +荻 +荼 +荽 +莆 +莉 +莊 +莎 +莒 +莓 +莕 +莖 +莘 +莙 +莛 +莜 +莞 +莠 +莢 +莧 +莨 +莩 +莪 +莫 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菉 +菊 +菌 +菍 +菏 +菑 +菓 +菔 +菖 +菘 +菜 +菝 +菟 +菠 +菡 +菥 +菩 +菪 +菫 +華 +菰 +菱 +菲 +菴 +菶 +菸 +菹 +菺 +菼 +菽 +菾 +萁 +萃 +萄 +萇 +萊 +萌 +萍 +萎 +萐 +萘 +萜 +萠 +萡 +萣 +萩 +萬 +萭 +萱 +萵 +萸 +萹 +萼 +落 +葃 +葆 +葉 +葊 +葎 +葑 +葒 +著 +葙 +葚 +葛 +葜 +葝 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葰 +葳 +葵 +葶 +葷 +葺 +蒂 +蒄 +蒍 +蒎 +蒐 +蒓 +蒔 +蒗 +蒙 +蒜 +蒞 +蒟 +蒡 +蒢 +蒤 +蒧 +蒨 +蒭 +蒯 +蒲 +蒴 +蒸 +蒹 +蒺 +蒻 +蒼 +蒽 +蒾 +蒿 +蓀 +蓁 +蓂 +蓄 +蓆 +蓉 +蓋 +蓍 +蓑 +蓓 +蓖 +蓘 +蓚 +蓧 +蓨 +蓪 +蓬 +蓭 +蓮 +蓯 +蓳 +蓼 +蓽 +蓿 +蔆 +蔎 +蔑 +蔓 +蔔 +蔕 +蔗 +蔘 +蔚 +蔝 +蔞 +蔡 +蔣 +蔥 +蔦 +蔬 +蔭 +蔴 +蔵 +蔻 +蔽 +蕁 +蕃 +蕅 +蕈 +蕉 +蕊 +蕎 +蕑 +蕒 +蕖 +蕘 +蕙 +蕚 +蕟 +蕡 +蕢 +蕤 +蕨 +蕩 +蕪 +蕭 +蕷 +蕹 +蕺 +蕻 +蕾 +薀 +薄 +薆 +薇 +薈 +薊 +薌 +薏 +薐 +薑 +薔 +薗 +薘 +薙 +薛 +薜 +薞 +薟 +薡 +薦 +薨 +薩 +薪 +薫 +薬 +薯 +薰 +薲 +薷 +薸 +薹 +薺 +薾 +薿 +藁 +藉 +藍 +藎 +藏 +藐 +藔 +藕 +藜 +藝 +藟 +藤 +藥 +藦 +藨 +藩 +藪 +藶 +藸 +藹 +藺 +藻 +藿 +蘂 +蘄 +蘅 +蘆 +蘇 +蘊 +蘋 +蘐 +蘑 +蘓 +蘗 +蘘 +蘚 +蘞 +蘢 +蘧 +蘩 +蘭 +蘵 +蘶 +蘸 +蘼 +蘿 +虉 +虎 +虐 +虓 +虔 +處 +虖 +虛 +虜 +虞 +號 +虢 +虧 +虨 +虯 +虱 +虵 +虹 +虺 +虻 +蚆 +蚊 +蚋 +蚌 +蚍 +蚓 +蚖 +蚜 +蚝 +蚡 +蚢 +蚣 +蚤 +蚧 +蚨 +蚩 +蚪 +蚯 +蚱 +蚴 +蚵 +蚶 +蚺 +蚼 +蛀 +蛄 +蛇 +蛉 +蛋 +蛍 +蛐 +蛑 +蛔 +蛙 +蛛 +蛞 +蛟 +蛤 +蛭 +蛯 +蛸 +蛹 +蛺 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜉 +蜊 +蜍 +蜑 +蜒 +蜓 +蜘 +蜚 +蜛 +蜜 +蜞 +蜢 +蜣 +蜥 +蜨 +蜮 +蜯 +蜱 +蜴 +蜷 +蜻 +蜾 +蜿 +蝀 +蝌 +蝍 +蝎 +蝓 +蝕 +蝗 +蝘 +蝙 +蝚 +蝟 +蝠 +蝣 +蝤 +蝦 +蝨 +蝮 +蝯 +蝰 +蝲 +蝴 +蝶 +蝸 +蝽 +螂 +螃 +螄 +螅 +螈 +螋 +融 +螐 +螔 +螞 +螟 +螠 +螢 +螣 +螥 +螫 +螭 +螯 +螳 +螶 +螺 +螻 +螽 +螾 +蟀 +蟄 +蟅 +蟆 +蟊 +蟋 +蟌 +蟎 +蟑 +蟒 +蟜 +蟠 +蟥 +蟪 +蟫 +蟬 +蟯 +蟲 +蟳 +蟴 +蟶 +蟹 +蟻 +蟾 +蠂 +蠃 +蠄 +蠅 +蠆 +蠊 +蠋 +蠍 +蠐 +蠑 +蠓 +蠔 +蠕 +蠖 +蠘 +蠙 +蠟 +蠡 +蠢 +蠣 +蠱 +蠲 +蠵 +蠶 +蠷 +蠹 +蠻 +血 +衂 +衆 +行 +衍 +衎 +術 +衕 +衖 +街 +衙 +衚 +衛 +衜 +衝 +衞 +衡 +衢 +衣 +表 +衩 +衫 +衰 +衲 +衷 +衽 +衾 +衿 +袁 +袂 +袈 +袋 +袍 +袓 +袖 +袛 +袞 +袤 +袪 +被 +袱 +袴 +袾 +裁 +裂 +裊 +裎 +裒 +裔 +裕 +裖 +裘 +裙 +補 +裝 +裟 +裡 +裨 +裬 +裱 +裳 +裴 +裵 +裸 +裹 +製 +裾 +裿 +褀 +褂 +複 +褌 +褍 +褎 +褐 +褒 +褓 +褔 +褘 +褙 +褚 +褞 +褥 +褧 +褪 +褫 +褭 +褲 +褶 +褸 +褻 +襄 +襌 +襖 +襞 +襟 +襠 +襤 +襦 +襪 +襯 +襲 +襴 +襶 +襻 +襾 +西 +要 +覃 +覆 +覇 +覈 +見 +覌 +規 +覓 +視 +覚 +覡 +覦 +覧 +親 +覬 +覲 +観 +覺 +覽 +覿 +觀 +角 +觔 +觙 +觚 +觜 +解 +觭 +觱 +觴 +觶 +觸 +觿 +言 +訁 +訂 +訃 +訇 +計 +訊 +訌 +討 +訏 +訐 +訒 +訓 +訔 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訴 +訶 +診 +註 +証 +訾 +詁 +詆 +詈 +詐 +詒 +詔 +評 +詛 +詞 +詠 +詡 +詢 +詣 +詥 +試 +詧 +詩 +詫 +詭 +詮 +詰 +話 +該 +詳 +詵 +詹 +詼 +誄 +誅 +誇 +誌 +認 +誒 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誴 +誹 +誼 +誾 +調 +談 +請 +諍 +諏 +諒 +論 +諗 +諜 +諟 +諠 +諡 +諤 +諦 +諧 +諪 +諫 +諭 +諮 +諱 +諲 +諳 +諴 +諶 +諷 +諸 +諺 +諼 +諾 +謀 +謁 +謂 +謄 +謇 +謊 +謌 +謎 +謏 +謐 +謔 +謖 +謗 +謙 +謚 +講 +謜 +謝 +謠 +謢 +謤 +謨 +謩 +謫 +謬 +謳 +謹 +謾 +證 +譏 +譓 +譔 +識 +譙 +譚 +譜 +譞 +警 +譫 +譬 +譭 +譯 +議 +譲 +譳 +譴 +護 +譽 +譿 +讀 +讃 +變 +讌 +讎 +讓 +讖 +讙 +讚 +讜 +讞 +谷 +谿 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豕 +豚 +象 +豢 +豨 +豪 +豫 +豬 +豳 +豸 +豹 +豺 +豿 +貂 +貅 +貉 +貊 +貌 +貐 +貒 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貤 +貧 +貨 +販 +貪 +貫 +責 +貭 +貮 +貯 +貲 +貳 +貴 +貶 +買 +貸 +貺 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賒 +賓 +賔 +賕 +賚 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +賨 +質 +賬 +賭 +賴 +賹 +賺 +賻 +購 +賽 +賾 +贄 +贅 +贇 +贈 +贊 +贌 +贍 +贏 +贓 +贔 +贖 +贛 +赤 +赦 +赧 +赫 +赬 +赭 +走 +赳 +赴 +起 +趁 +超 +越 +趐 +趕 +趖 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趺 +趼 +趾 +跅 +跆 +跋 +跌 +跏 +跑 +跖 +跗 +跛 +距 +跟 +跡 +跣 +跤 +跨 +跩 +跪 +路 +跳 +踎 +踏 +踐 +踝 +踞 +踢 +踩 +踰 +踴 +踹 +踺 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹕 +蹙 +蹟 +蹠 +蹤 +蹦 +蹬 +蹭 +蹯 +蹲 +蹴 +蹶 +蹺 +蹻 +蹼 +躁 +躂 +躄 +躉 +躋 +躍 +躑 +躒 +躔 +躝 +躪 +身 +躬 +躰 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軎 +軒 +軔 +軛 +軟 
+転 +軫 +軲 +軸 +軹 +軺 +軻 +軼 +軽 +軾 +較 +輄 +輅 +載 +輋 +輒 +輓 +輔 +輕 +輛 +輝 +輞 +輟 +輥 +輦 +輩 +輪 +輬 +輭 +輯 +輶 +輸 +輻 +輾 +輿 +轀 +轂 +轄 +轅 +轆 +轉 +轍 +轎 +轘 +轝 +轟 +轤 +辛 +辜 +辟 +辣 +辦 +辧 +辨 +辭 +辮 +辯 +辰 +辱 +農 +辵 +辺 +辻 +込 +迂 +迄 +迅 +迎 +近 +返 +迢 +迤 +迥 +迦 +迪 +迫 +迭 +迮 +述 +迴 +迵 +迷 +迸 +迺 +追 +退 +送 +逃 +逄 +逅 +逆 +逈 +逋 +逌 +逍 +逎 +透 +逐 +逑 +途 +逕 +逖 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逤 +逨 +逮 +逯 +進 +逴 +逵 +逸 +逹 +逺 +逼 +逾 +遁 +遂 +遄 +遇 +遊 +運 +遍 +過 +遏 +遐 +遒 +道 +達 +違 +遘 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遨 +適 +遭 +遮 +遯 +遲 +遴 +遵 +遶 +遷 +選 +遹 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邉 +邊 +邋 +邏 +邑 +邕 +邗 +邙 +邛 +邠 +邡 +邢 +那 +邦 +邨 +邪 +邯 +邰 +邱 +邲 +邳 +邴 +邵 +邸 +邽 +邾 +郁 +郃 +郄 +郅 +郇 +郊 +郋 +郎 +郗 +郛 +郜 +郝 +郞 +郟 +郡 +郢 +郤 +部 +郪 +郫 +郭 +郯 +郳 +郴 +郵 +郷 +都 +郾 +郿 +鄂 +鄃 +鄄 +鄆 +鄉 +鄋 +鄑 +鄒 +鄔 +鄖 +鄗 +鄘 +鄙 +鄚 +鄜 +鄞 +鄠 +鄢 +鄣 +鄤 +鄧 +鄩 +鄫 +鄭 +鄯 +鄰 +鄱 +鄲 +鄳 +鄴 +鄺 +酃 +酆 +酈 +酉 +酊 +酋 +酌 +配 +酎 +酏 +酐 +酒 +酔 +酗 +酚 +酞 +酡 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酴 +酵 +酶 +酷 +酸 +酺 +酼 +醁 +醂 +醃 +醅 +醇 +醉 +醋 +醌 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醢 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +醾 +醿 +釀 +釁 +釆 +采 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釒 +釓 +釔 +釕 +釗 +釘 +釙 +釚 +釜 +針 +釣 +釤 +釦 +釧 +釩 +釪 +釭 +釴 +釵 +釷 +釹 +釺 +鈀 +鈁 +鈄 +鈇 +鈈 +鈉 +鈊 +鈍 +鈏 +鈐 +鈑 +鈔 +鈕 +鈖 +鈞 +鈢 +鈣 +鈥 +鈦 +鈫 +鈮 +鈰 +鈳 +鈴 +鈷 +鈸 +鈹 +鈺 +鈾 +鈿 +鉀 +鉄 +鉅 +鉆 +鉈 +鉉 +鉋 +鉌 +鉍 +鉏 +鉑 +鉓 +鉗 +鉚 +鉛 +鉞 +鉟 +鉤 +鉦 +鉬 +鉭 +鉲 +鉶 +鉷 +鉸 +鉻 +鉾 +鉿 +銀 +銂 +銃 +銅 +銋 +銍 +銑 +銓 +銕 +銖 +銘 +銚 +銜 +銠 +銣 +銥 +銦 +銨 +銩 +銪 +銫 +銬 +銭 +銱 +銲 +銳 +銶 +銷 +銹 +銻 +銼 +銾 +鋁 +鋅 +鋆 +鋇 +鋌 +鋏 +鋐 +鋒 +鋕 +鋗 +鋙 +鋡 +鋤 +鋥 +鋦 +鋨 +鋪 +鋮 +鋯 +鋰 +鋱 +鋳 +鋶 +鋸 +鋹 +鋼 +錀 +錄 +錏 +錐 +錒 +錕 +錘 +錚 +錞 +錟 +錠 +錡 +錢 +錦 +錨 +錫 +錬 +錮 +錯 +錳 +錶 +錸 +錻 +鍀 +鍇 +鍈 +鍉 +鍊 +鍋 +鍍 +鍏 +鍔 +鍘 +鍛 +鍝 +鍟 +鍠 +鍥 +鍩 +鍬 +鍱 +鍳 +鍵 +鍶 +鍷 +鍺 +鍼 +鍾 +鎂 +鎅 +鎊 +鎌 +鎏 +鎓 +鎔 +鎖 +鎗 +鎘 +鎚 +鎛 +鎢 +鎣 +鎦 +鎧 +鎪 +鎬 +鎭 +鎮 +鎰 +鎳 +鎵 +鎻 +鏃 +鏇 +鏈 +鏊 +鏌 +鏐 +鏑 +鏓 +鏖 +鏗 +鏘 +鏜 +鏝 +鏞 +鏟 +鏡 +鏢 +鏤 +鏦 +鏳 +鏴 +鏵 +鏷 +鏻 +鏽 +鐃 +鐇 +鐈 +鐓 +鐔 +鐘 +鐙 +鐠 +鐡 +鐤 +鐦 +鐧 +鐫 +鐬 +鐭 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鐽 +鐿 +鑀 +鑁 +鑂 +鑄 +鑅 +鑊 +鑌 +鑑 +鑒 +鑛 +鑠 +鑣 +鑨 +鑪 +鑫 +鑭 +鑰 +鑲 +鑴 +鑷 +鑼 +鑽 +鑾 +鑿 +長 +門 +閂 +閃 +閆 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閜 +閞 +閟 +関 +閣 +閥 +閦 +閨 +閩 +閬 +閭 +閰 +閱 +閶 +閹 +閻 +閼 +閾 +閿 +闆 +闇 +闈 +闊 +闋 +闌 +闍 +闐 +闓 +闔 +闕 +闖 +闘 +關 +闞 +闡 +闢 +闥 +阜 +阝 +阡 +阪 +阭 +阮 +阯 +阱 +防 +阻 +阿 +陀 +陁 +陂 +附 +陋 +陌 +降 +限 +陔 +陘 +陛 +陜 +陝 +陞 +陟 +陡 +院 +陣 +除 +陪 +陬 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隄 +隅 +隆 +隈 +隊 +隋 +隍 +階 +隔 +隕 +隗 +隘 +隙 +際 +障 +隣 +隧 +隨 +險 +隰 +隱 +隲 +隳 +隴 +隷 +隸 +隹 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雝 +雞 +離 +難 +雨 +雩 +雪 +雫 +雯 +雱 +雲 +零 +雷 +雹 +電 +需 +霄 +霅 +霆 +震 +霈 +霉 +霊 +霍 +霎 +霏 +霑 +霓 +霖 +霙 +霜 +霞 +霤 +霧 +霨 +霰 +露 +霶 +霸 +霹 +霽 +霾 +靁 +靂 +靄 +靈 +靉 +靑 +青 +靖 +靚 +靛 +靜 +非 +靠 +靡 +面 +革 +靫 +靬 +靭 +靳 +靴 +靶 +靺 +靼 +鞅 +鞆 +鞋 +鞍 +鞏 +鞘 +鞞 +鞠 +鞣 +鞥 +鞦 +鞨 +鞭 +鞮 +鞴 +韁 +韃 +韆 +韋 +韌 +韑 +韓 +韙 +韜 +韞 +韠 +韡 +韭 +韮 +音 +韶 +韺 +韻 +韾 +響 +頁 +頂 +頃 +項 +順 +須 +頊 +頌 +頍 +頎 +頏 +預 +頑 +頒 +頓 +頔 +頗 +領 +頜 +頠 +頡 +頤 +頦 +頫 +頭 +頰 +頴 +頵 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顒 +顓 +顔 +顕 +顗 +願 +顙 +顛 +類 +顥 +顧 +顫 +顯 +顰 +顱 +顳 +顴 +風 +颮 +颯 +颱 +颶 +颺 +颼 +飄 +飆 +飈 +飛 +食 +飠 +飡 +飢 +飥 +飩 +飪 +飫 +飬 +飭 +飮 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餄 +餅 +餉 +養 +餌 +餎 +餐 +餒 +餓 +餗 +餘 +餚 +餛 +餞 +餠 +餡 +館 +餮 +餵 +餺 +餾 +餿 +饃 +饅 +饋 +饌 +饑 +饒 +饕 +饗 +饞 +饟 +饢 +首 +馗 +馘 +香 +馛 +馥 +馦 +馨 +馬 +馭 +馮 +馯 +馱 +馳 +馴 +馼 +駁 +駄 +駅 +駆 +駐 +駑 +駒 +駔 +駕 +駘 +駙 +駛 +駝 +駟 +駢 +駭 +駰 +駱 +駿 +騁 +騂 +騄 +騅 +騋 +騎 +騏 +験 +騖 +騙 +騤 +騨 +騫 +騭 +騮 +騰 +騶 +騷 +騾 +驁 +驃 +驄 +驅 +驊 +驌 +驍 +驎 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驤 +驥 +驩 +驪 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髀 +髂 +髎 +髏 +髑 +髒 +髓 +體 +高 +髙 +髡 +髦 +髪 +髭 +髮 +髯 +髲 +髷 +髹 +髻 +鬃 +鬄 +鬅 +鬆 +鬍 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬨 +鬩 +鬪 +鬬 +鬮 +鬯 +鬱 +鬲 +鬹 +鬻 +鬼 +魁 +魂 +魃 +魄 +魅 +魈 +魋 +魍 +魎 +魏 +魔 +魕 +魘 +魚 +魛 +魞 +魟 +魣 +魨 +魩 +魮 +魯 +魴 +魷 +鮀 +鮁 +鮃 +鮄 +鮊 +鮋 +鮍 +鮐 +鮑 +鮒 +鮓 +鮗 +鮜 +鮟 +鮠 +鮡 +鮣 +鮨 +鮪 +鮫 +鮭 +鮮 +鮰 +鮸 +鮹 +鮻 +鯀 +鯁 +鯃 +鯇 +鯉 +鯊 +鯏 +鯒 +鯓 +鯔 +鯕 +鯖 +鯗 +鯙 +鯛 +鯡 +鯢 +鯤 +鯧 +鯨 +鯪 +鯭 +鯮 +鯰 +鯶 +鯷 +鯻 +鯽 +鯿 +鰂 +鰃 +鰆 +鰈 +鰉 +鰍 +鰏 +鰒 +鰓 +鰕 +鰗 +鰛 +鰜 +鰟 +鰣 +鰤 +鰧 +鰨 +鰩 +鰭 
+鰮 +鰱 +鰲 +鰳 +鰶 +鰷 +鰹 +鰺 +鰻 +鰼 +鰾 +鱀 +鱂 +鱅 +鱇 +鱈 +鱉 +鱊 +鱒 +鱓 +鱔 +鱖 +鱗 +鱘 +鱚 +鱝 +鱟 +鱠 +鱣 +鱥 +鱧 +鱨 +鱬 +鱮 +鱰 +鱲 +鱵 +鱷 +鱸 +鱺 +鱻 +鳥 +鳧 +鳩 +鳯 +鳰 +鳳 +鳴 +鳶 +鳽 +鴆 +鴇 +鴉 +鴒 +鴓 +鴕 +鴗 +鴛 +鴝 +鴞 +鴟 +鴡 +鴣 +鴦 +鴨 +鴫 +鴯 +鴰 +鴴 +鴻 +鴿 +鵂 +鵄 +鵎 +鵐 +鵑 +鵒 +鵓 +鵙 +鵜 +鵝 +鵞 +鵟 +鵠 +鵡 +鵪 +鵬 +鵯 +鵰 +鵲 +鵵 +鵼 +鵾 +鶆 +鶇 +鶉 +鶏 +鶒 +鶓 +鶘 +鶚 +鶡 +鶥 +鶩 +鶬 +鶯 +鶲 +鶴 +鶹 +鶺 +鶻 +鶼 +鶿 +鷂 +鷄 +鷉 +鷎 +鷓 +鷗 +鷙 +鷚 +鷟 +鷥 +鷦 +鷫 +鷯 +鷲 +鷳 +鷸 +鷹 +鷺 +鸊 +鸌 +鸐 +鸑 +鸕 +鸘 +鸚 +鸛 +鸜 +鸝 +鸞 +鹮 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麅 +麇 +麈 +麊 +麋 +麐 +麒 +麓 +麗 +麝 +麞 +麟 +麥 +麩 +麪 +麯 +麴 +麵 +麹 +麺 +麻 +麼 +麽 +麾 +麿 +黁 +黃 +黇 +黌 +黍 +黎 +黏 +黐 +黑 +黒 +黔 +默 +黙 +黛 +黜 +黝 +點 +黟 +黥 +黧 +黨 +黯 +黴 +黶 +黻 +黼 +黽 +黿 +鼂 +鼇 +鼈 +鼉 +鼎 +鼐 +鼒 +鼓 +鼕 +鼙 +鼠 +鼢 +鼩 +鼬 +鼯 +鼱 +鼴 +鼷 +鼻 +鼽 +鼾 +齊 +齋 +齒 +齕 +齡 +齣 +齦 +齧 +齲 +齶 +龍 +龎 +龐 +龑 +龔 +龕 +龜 +龝 +龠 +龢 +郎 +凉 +﹑ +﹗ +﹝ +﹞ +﹢ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +K +L +M +N +O +P +R +S +T +U +V +W +Y +Z +[ +] +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +r +s +t +u +z +{ +| +} +~ +¥ +𣇉 + diff --git a/ppocr/utils/dict/confuse.pkl b/ppocr/utils/dict/confuse.pkl new file mode 100644 index 0000000000000000000000000000000000000000..83c8a74b9be7873bb89997e483106bbc924963bd --- /dev/null +++ b/ppocr/utils/dict/confuse.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57d8aa98a781330f7534a81263179e77c159a37050bd31fe09d8ddf1880c5628 +size 30912 diff --git a/ppocr/utils/dict/cyrillic_dict.txt b/ppocr/utils/dict/cyrillic_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6f66494d5417e18bbd225719aa72690e09e126 --- /dev/null +++ b/ppocr/utils/dict/cyrillic_dict.txt @@ -0,0 +1,163 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +Ё +Є +І +Ј +Љ +Ў +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ё +ђ +є +і +ј +љ +њ +ћ +ў +џ +Ґ +ґ diff --git a/ppocr/utils/dict/devanagari_dict.txt b/ppocr/utils/dict/devanagari_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..f55923061bfd480b875bb3679d7a75a9157387a9 --- /dev/null +++ b/ppocr/utils/dict/devanagari_dict.txt @@ -0,0 +1,167 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +॒ +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/ppocr/utils/dict/en_dict.txt b/ppocr/utils/dict/en_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fbd99f46acca8391a5e86ae546c637399204506 --- /dev/null +++ b/ppocr/utils/dict/en_dict.txt @@ -0,0 +1,63 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + diff --git a/ppocr/utils/dict/fa_dict.txt b/ppocr/utils/dict/fa_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..2328fbd8374b3c551036a8521c1a70104925b5a8 --- /dev/null +++ b/ppocr/utils/dict/fa_dict.txt @@ -0,0 +1,136 @@ +f +a +_ +i +m +g +/ +1 +3 +I +L +S +V +R +C +2 +0 +v +l +6 +8 +5 +. +j +p +و +د +ر +ك +ن +ش +ه +ا +4 +9 +ی +ج +ِ +7 +غ +ل +س +ز +ّ +ت +ک +گ +ي +م +ب +ف +چ +خ +ق +ژ +آ +ص +پ +َ +ع +ئ +ح +ٔ +ض +ُ +ذ +أ +ى +ط +ظ +ث +ة +ً +ء +ؤ +ْ +ۀ +إ +ٍ +ٌ +ٰ +ٓ +ٱ +s +c +e +n +w +N +E +W +Y +D +O +H +A +d +z +r +T +G +o +t +x +h +b +B +M +Z +u +P +F +y +q +U +K +k +J +Q +' +X +# +? +% +$ +, +: +& +! +- +( +É +@ +é ++ + diff --git a/ppocr/utils/dict/french_dict.txt b/ppocr/utils/dict/french_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8f657db35bf0b74f779f38a9e3b9b47b007e3c4 --- /dev/null +++ b/ppocr/utils/dict/french_dict.txt @@ -0,0 +1,136 @@ +f +e +n +c +h +_ +i +m +g +/ +r +v +a +l +t +w +o +d +6 +1 +. +p +B +u +2 +à +3 +R +y +4 +U +E +A +5 +P +O +S +T +D +7 +Z +8 +I +N +L +G +M +H +0 +J +K +- +9 +F +C +V +é +X +' +s +Q +: +è +x +b +Y +Œ +É +z +W +Ç +È +k +Ô +ô +€ +À +Ê +q +ù +° +ê +î +* + +j +" +, +â +% +û +ç +ü +? +! +; +ö +( +) +ï +º +ó +ø +å ++ +™ +á +Ë +< +² +Á +Î +& +@ +œ +ε +Ü +ë +[ +] +í +ò +Ö +ä +ß +« +» +ú +ñ +æ +µ +³ +Å +$ +# + diff --git a/ppocr/utils/dict/german_dict.txt b/ppocr/utils/dict/german_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e121af21a1617dd970234ca98ae7072b0335332 --- /dev/null +++ b/ppocr/utils/dict/german_dict.txt @@ -0,0 +1,143 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +£ +§ +­ +° +´ +µ +· +º +¿ +Á +Ä +Å +É +Ï +Ô +Ö +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ï +ñ +ò +ó +ô +ö +ø +ù +ú +û +ü +ō +Š +Ÿ +ʒ +β +δ +з +Ṡ +‘ +€ +© +ª +« +¬ diff --git a/ppocr/utils/dict/hi_dict.txt b/ppocr/utils/dict/hi_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dfedb5ac483966de40caabe0e95118f88aa5a54 --- /dev/null +++ b/ppocr/utils/dict/hi_dict.txt @@ -0,0 +1,162 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/ppocr/utils/dict/it_dict.txt b/ppocr/utils/dict/it_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e692c6d4335b4b8b2ed873d7923a69ed5e3d6c9a --- /dev/null +++ b/ppocr/utils/dict/it_dict.txt @@ -0,0 +1,118 @@ +i +t +_ +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +7 +8 +9 +6 +. +j +p + +e +r +o +d +s +n +3 +4 +P +u +c +A +- +, +" +z +h +f +b +q +ì +' +à +O +è +G +ù +é +ò +; +F +E +B +N +H +k +: +U +T +X +D +K +? +[ +M +­ +x +y +( +) +W +ö +º +w +] +Q +J ++ +ü +! +È +á +% += +» +ñ +Ö +Y +ä +í +Z +« +@ +ó +ø +ï +ú +ê +ç +Á +É +Å +ß +{ +} +& +` +û +î +# +$ diff --git a/ppocr/utils/dict/japan_dict.txt b/ppocr/utils/dict/japan_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..339d4b89e5159a346636641a0814874faa59754a --- /dev/null +++ b/ppocr/utils/dict/japan_dict.txt @@ -0,0 +1,4399 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +° +² +´ +½ +Á +Ä +Å +Ç +È +É +Í +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ğ +ī +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ż +Ž +ž +Ș +ș +ț +Δ +α +λ +μ +φ +Г +О +а +в +л +о +р +с +т +я +ồ +​ +— +― +’ +“ +” +… +℃ +→ +∇ +− +■ +☆ +  +、 +。 +々 +〆 +〈 +〉 +「 +」 +『 +』 +〔 +〕 +〜 +ぁ +あ +ぃ +い +う +ぇ +え +ぉ +お +か +が +き +ぎ +く +ぐ +け +げ +こ +ご +さ +ざ +し +じ +す +ず +せ +ぜ +そ +ぞ +た +だ +ち +ぢ +っ +つ +づ +て +で +と +ど +な +に +ぬ +ね +の +は +ば +ぱ +ひ +び +ぴ +ふ +ぶ +ぷ +へ +べ +ぺ +ほ +ぼ +ぽ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +ゑ +を +ん +ゝ +ゞ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +ガ +キ +ギ +ク +グ +ケ +ゲ +コ +ゴ +サ +ザ +シ +ジ +ス +ズ +セ +ゼ +ソ +ゾ +タ +ダ +チ +ヂ +ッ +ツ +ヅ +テ +デ +ト +ド +ナ +ニ +ヌ +ネ +ノ +ハ +バ +パ +ヒ +ビ +ピ +フ +ブ +プ +ヘ +ベ +ペ +ホ +ボ +ポ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヰ +ン +ヴ +ヵ +ヶ +・ +ー +㈱ +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丑 +且 +世 +丘 +丙 +丞 +両 +並 +中 +串 +丸 +丹 +主 +丼 +丿 +乃 +久 +之 +乎 +乏 +乗 +乘 +乙 +九 +乞 +也 +乱 +乳 +乾 +亀 +了 +予 +争 +事 +二 +于 +互 +五 +井 +亘 +亙 +些 +亜 +亟 +亡 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +人 +什 +仁 +仇 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +代 +令 +以 +仮 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +会 +伝 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +位 +低 +住 +佐 +佑 +体 +何 +余 +佚 +佛 +作 +佩 +佳 +併 +佶 +使 +侈 +例 +侍 +侏 +侑 +侘 +供 +依 +侠 +価 +侮 +侯 +侵 +侶 +便 +係 +促 +俄 +俊 +俔 +俗 +俘 +保 +信 +俣 +俤 +修 +俯 +俳 +俵 +俸 +俺 +倉 +個 +倍 +倒 +候 +借 +倣 +値 +倫 +倭 +倶 +倹 +偃 +假 +偈 +偉 +偏 +偐 +偕 +停 +健 +側 +偵 +偶 +偽 +傀 +傅 +傍 +傑 +傘 +備 +催 +傭 +傲 +傳 +債 +傷 +傾 +僊 +働 +像 +僑 +僕 +僚 +僧 +僭 +僮 +儀 +億 +儇 +儒 +儛 +償 +儡 +優 +儲 +儺 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +兎 +児 +党 +兜 +入 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +内 +円 +冊 +再 +冑 +冒 +冗 +写 +冠 +冤 +冥 +冨 +冬 +冲 +决 +冶 +冷 +准 +凉 +凋 +凌 +凍 +凛 +凝 +凞 +几 +凡 +処 +凪 +凰 +凱 +凶 +凸 +凹 +出 +函 +刀 +刃 +分 +切 +刈 +刊 +刎 +刑 +列 +初 +判 +別 +利 +刪 +到 +制 +刷 +券 +刹 +刺 +刻 +剃 +則 +削 +剋 +前 +剖 +剛 +剣 +剤 +剥 +剪 +副 +剰 +割 +創 +剽 +劇 +劉 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劭 +励 +労 +効 +劾 +勃 +勅 +勇 +勉 +勒 +動 +勘 +務 +勝 +募 +勢 +勤 +勧 +勲 +勺 +勾 +勿 +匁 +匂 +包 +匏 +化 +北 +匙 +匝 +匠 +匡 +匣 +匯 +匲 +匹 +区 +医 +匿 +十 +千 +升 +午 +卉 +半 +卍 +卑 +卒 +卓 +協 +南 +単 +博 +卜 +占 +卦 +卯 +印 +危 +即 +却 +卵 +卸 +卿 +厄 +厚 +原 +厠 
+厨 +厩 +厭 +厳 +去 +参 +又 +叉 +及 +友 +双 +反 +収 +叔 +取 +受 +叙 +叛 +叟 +叡 +叢 +口 +古 +句 +叩 +只 +叫 +召 +可 +台 +叱 +史 +右 +叶 +号 +司 +吃 +各 +合 +吉 +吊 +同 +名 +后 +吏 +吐 +向 +君 +吝 +吟 +吠 +否 +含 +吸 +吹 +吻 +吽 +吾 +呂 +呆 +呈 +呉 +告 +呑 +周 +呪 +呰 +味 +呼 +命 +咀 +咄 +咋 +和 +咒 +咫 +咲 +咳 +咸 +哀 +品 +哇 +哉 +員 +哨 +哩 +哭 +哲 +哺 +唄 +唆 +唇 +唐 +唖 +唯 +唱 +唳 +唸 +唾 +啄 +商 +問 +啓 +啼 +善 +喋 +喚 +喜 +喝 +喧 +喩 +喪 +喫 +喬 +單 +喰 +営 +嗅 +嗇 +嗔 +嗚 +嗜 +嗣 +嘆 +嘉 +嘗 +嘘 +嘩 +嘯 +嘱 +嘲 +嘴 +噂 +噌 +噛 +器 +噴 +噺 +嚆 +嚢 +囀 +囃 +囉 +囚 +四 +回 +因 +団 +困 +囲 +図 +固 +国 +圀 +圃 +國 +圏 +園 +圓 +團 +圜 +土 +圧 +在 +圭 +地 +址 +坂 +均 +坊 +坐 +坑 +坡 +坤 +坦 +坪 +垂 +型 +垢 +垣 +埃 +埋 +城 +埒 +埔 +域 +埠 +埴 +埵 +執 +培 +基 +埼 +堀 +堂 +堅 +堆 +堕 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +塀 +塁 +塊 +塑 +塔 +塗 +塘 +塙 +塚 +塞 +塩 +填 +塵 +塾 +境 +墉 +墓 +増 +墜 +墟 +墨 +墳 +墺 +墻 +墾 +壁 +壇 +壊 +壌 +壕 +士 +壬 +壮 +声 +壱 +売 +壷 +壹 +壺 +壽 +変 +夏 +夕 +外 +夙 +多 +夜 +夢 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +失 +夷 +夾 +奄 +奇 +奈 +奉 +奎 +奏 +契 +奔 +奕 +套 +奘 +奠 +奢 +奥 +奨 +奪 +奮 +女 +奴 +奸 +好 +如 +妃 +妄 +妊 +妍 +妓 +妖 +妙 +妥 +妨 +妬 +妲 +妹 +妻 +妾 +姉 +始 +姐 +姓 +委 +姚 +姜 +姞 +姥 +姦 +姨 +姪 +姫 +姶 +姻 +姿 +威 +娑 +娘 +娟 +娠 +娩 +娯 +娼 +婆 +婉 +婚 +婢 +婦 +婬 +婿 +媄 +媒 +媓 +媚 +媛 +媞 +媽 +嫁 +嫄 +嫉 +嫌 +嫐 +嫗 +嫡 +嬉 +嬌 +嬢 +嬪 +嬬 +嬾 +孁 +子 +孔 +字 +存 +孚 +孝 +孟 +季 +孤 +学 +孫 +孵 +學 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +実 +客 +宣 +室 +宥 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寂 +寄 +寅 +密 +寇 +富 +寒 +寓 +寔 +寛 +寝 +察 +寡 +實 +寧 +審 +寮 +寵 +寶 +寸 +寺 +対 +寿 +封 +専 +射 +将 +尉 +尊 +尋 +對 +導 +小 +少 +尖 +尚 +尤 +尪 +尭 +就 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +居 +屈 +届 +屋 +屍 +屎 +屏 +屑 +屓 +展 +属 +屠 +層 +履 +屯 +山 +岐 +岑 +岡 +岩 +岫 +岬 +岳 +岷 +岸 +峠 +峡 +峨 +峯 +峰 +島 +峻 +崇 +崋 +崎 +崑 +崖 +崗 +崛 +崩 +嵌 +嵐 +嵩 +嵯 +嶂 +嶋 +嶠 +嶺 +嶼 +嶽 +巀 +巌 +巒 +巖 +川 +州 +巡 +巣 +工 +左 +巧 +巨 +巫 +差 +己 +巳 +巴 +巷 +巻 +巽 +巾 +市 +布 +帆 +希 +帖 +帚 +帛 +帝 +帥 +師 +席 +帯 +帰 +帳 +帷 +常 +帽 +幄 +幅 +幇 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +干 +平 +年 +并 +幸 +幹 +幻 +幼 +幽 +幾 +庁 +広 +庄 +庇 +床 +序 +底 +庖 +店 +庚 +府 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +廂 +廃 +廉 +廊 +廓 +廟 +廠 +廣 +廬 +延 +廷 +建 +廻 +廼 +廿 +弁 +弄 +弉 +弊 +弌 +式 +弐 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弥 +弦 +弧 +弱 +張 +強 +弼 +弾 +彈 +彊 +彌 +彎 +当 +彗 +彙 +彝 +形 +彦 +彩 +彫 +彬 +彭 +彰 +影 +彷 +役 +彼 +往 +征 +徂 +径 +待 +律 +後 +徐 +徑 +徒 +従 +得 +徠 +御 +徧 +徨 +復 +循 +徭 +微 +徳 +徴 +德 +徹 +徽 +心 +必 +忉 +忌 +忍 +志 +忘 +忙 +応 +忠 +快 +忯 +念 +忻 +忽 +忿 +怒 +怖 +思 +怠 +怡 +急 +性 +怨 +怪 +怯 +恂 +恋 +恐 +恒 +恕 +恣 +恤 +恥 +恨 +恩 +恬 +恭 +息 +恵 +悉 +悌 +悍 +悔 +悟 +悠 +患 +悦 +悩 +悪 +悲 +悼 +情 +惇 +惑 +惚 +惜 +惟 +惠 +惣 +惧 +惨 +惰 +想 +惹 +惺 +愈 +愉 +愍 +意 +愔 +愚 +愛 +感 +愷 +愿 +慈 +態 +慌 +慎 +慕 +慢 +慣 +慧 +慨 +慮 +慰 +慶 +憂 +憎 +憐 +憑 +憙 +憤 +憧 +憩 +憬 +憲 +憶 +憾 +懇 +應 +懌 +懐 +懲 +懸 +懺 +懽 +懿 +戈 +戊 +戌 +戎 +成 +我 +戒 +戔 +或 +戚 +戟 +戦 +截 +戮 +戯 +戴 +戸 +戻 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +打 +払 +托 +扮 +扱 +扶 +批 +承 +技 +抄 +把 +抑 +抓 +投 +抗 +折 +抜 +択 +披 +抱 +抵 +抹 +押 +抽 +担 +拇 +拈 +拉 +拍 +拏 +拐 +拒 +拓 +拘 +拙 +招 +拝 +拠 +拡 +括 +拭 +拳 +拵 +拶 +拾 +拿 +持 +挂 +指 +按 +挑 +挙 +挟 +挨 +振 +挺 +挽 +挿 +捉 +捕 +捗 +捜 +捧 +捨 +据 +捺 +捻 +掃 +掄 +授 +掌 +排 +掖 +掘 +掛 +掟 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掲 +掴 +掻 +掾 +揃 +揄 +揆 +揉 +描 +提 +揖 +揚 +換 +握 +揮 +援 +揶 +揺 +損 +搦 +搬 +搭 +携 +搾 +摂 +摘 +摩 +摸 +摺 +撃 +撒 +撞 +撤 +撥 +撫 +播 +撮 +撰 +撲 +撹 +擁 +操 +擔 +擦 +擬 +擾 +攘 +攝 +攣 +支 +收 +改 +攻 +放 +政 +故 +敏 +救 +敗 +教 +敢 +散 +敦 +敬 +数 +整 +敵 +敷 +斂 +文 +斉 +斎 +斐 +斑 +斗 +料 +斜 +斟 +斤 +斥 +斧 +斬 +断 +斯 +新 +方 +於 +施 +旁 +旅 +旋 +旌 +族 +旗 +旛 +无 +旡 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旺 +旻 +昂 +昆 +昇 +昉 +昌 +明 +昏 +易 +昔 +星 +映 +春 +昧 +昨 +昪 +昭 +是 +昵 +昼 +晁 +時 +晃 +晋 +晏 +晒 +晟 +晦 +晧 +晩 +普 +景 +晴 +晶 +智 +暁 +暇 +暈 +暉 +暑 +暖 +暗 +暘 +暢 +暦 +暫 +暮 +暲 +暴 +暹 +暾 +曄 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曰 +曲 +曳 +更 +書 +曹 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朋 +服 +朏 +朔 +朕 +朗 +望 +朝 +期 +朧 +木 +未 +末 +本 +札 +朱 +朴 +机 +朽 +杁 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +条 +杢 +杣 +来 +杭 +杮 +杯 +東 +杲 +杵 +杷 +杼 +松 +板 +枅 +枇 +析 +枓 +枕 +林 +枚 +果 +枝 +枠 +枡 +枢 +枯 +枳 +架 +柄 +柊 +柏 +某 +柑 +染 +柔 +柘 +柚 +柯 +柱 +柳 +柴 +柵 +査 +柾 +柿 +栂 +栃 +栄 +栖 +栗 +校 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桁 +桂 +桃 +框 +案 +桐 +桑 +桓 +桔 +桜 +桝 +桟 +桧 +桴 +桶 +桾 +梁 +梅 +梆 +梓 +梔 +梗 +梛 +條 +梟 +梢 +梧 +梨 +械 +梱 +梲 +梵 +梶 +棄 +棋 +棒 +棗 +棘 +棚 +棟 +棠 +森 +棲 
+棹 +棺 +椀 +椅 +椋 +植 +椎 +椏 +椒 +椙 +検 +椥 +椹 +椿 +楊 +楓 +楕 +楚 +楞 +楠 +楡 +楢 +楨 +楪 +楫 +業 +楮 +楯 +楳 +極 +楷 +楼 +楽 +概 +榊 +榎 +榕 +榛 +榜 +榮 +榱 +榴 +槃 +槇 +槊 +構 +槌 +槍 +槐 +様 +槙 +槻 +槽 +槿 +樂 +樋 +樓 +樗 +標 +樟 +模 +権 +横 +樫 +樵 +樹 +樺 +樽 +橇 +橋 +橘 +機 +橿 +檀 +檄 +檎 +檐 +檗 +檜 +檣 +檥 +檬 +檮 +檸 +檻 +櫃 +櫓 +櫛 +櫟 +櫨 +櫻 +欄 +欅 +欠 +次 +欣 +欧 +欲 +欺 +欽 +款 +歌 +歎 +歓 +止 +正 +此 +武 +歩 +歪 +歯 +歳 +歴 +死 +殆 +殉 +殊 +残 +殖 +殯 +殴 +段 +殷 +殺 +殻 +殿 +毀 +毅 +母 +毎 +毒 +比 +毘 +毛 +毫 +毬 +氈 +氏 +民 +気 +水 +氷 +永 +氾 +汀 +汁 +求 +汎 +汐 +汗 +汚 +汝 +江 +池 +汪 +汰 +汲 +決 +汽 +沂 +沃 +沅 +沆 +沈 +沌 +沐 +沓 +沖 +沙 +没 +沢 +沱 +河 +沸 +油 +治 +沼 +沽 +沿 +況 +泉 +泊 +泌 +法 +泗 +泡 +波 +泣 +泥 +注 +泯 +泰 +泳 +洋 +洒 +洗 +洛 +洞 +津 +洩 +洪 +洲 +洸 +洹 +活 +洽 +派 +流 +浄 +浅 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浮 +浴 +海 +浸 +涅 +消 +涌 +涙 +涛 +涯 +液 +涵 +涼 +淀 +淄 +淆 +淇 +淋 +淑 +淘 +淡 +淤 +淨 +淫 +深 +淳 +淵 +混 +淹 +添 +清 +済 +渉 +渋 +渓 +渕 +渚 +減 +渟 +渠 +渡 +渤 +渥 +渦 +温 +渫 +測 +港 +游 +渾 +湊 +湖 +湘 +湛 +湧 +湫 +湯 +湾 +湿 +満 +源 +準 +溜 +溝 +溢 +溥 +溪 +溶 +溺 +滄 +滅 +滋 +滌 +滑 +滕 +滝 +滞 +滴 +滸 +滹 +滿 +漁 +漂 +漆 +漉 +漏 +漑 +演 +漕 +漠 +漢 +漣 +漫 +漬 +漱 +漸 +漿 +潅 +潔 +潙 +潜 +潟 +潤 +潭 +潮 +潰 +潴 +澁 +澂 +澄 +澎 +澗 +澤 +澪 +澱 +澳 +激 +濁 +濃 +濟 +濠 +濡 +濤 +濫 +濯 +濱 +濾 +瀉 +瀋 +瀑 +瀕 +瀞 +瀟 +瀧 +瀬 +瀾 +灌 +灑 +灘 +火 +灯 +灰 +灸 +災 +炉 +炊 +炎 +炒 +炭 +炮 +炷 +点 +為 +烈 +烏 +烙 +烝 +烹 +焔 +焙 +焚 +無 +焦 +然 +焼 +煇 +煉 +煌 +煎 +煕 +煙 +煤 +煥 +照 +煩 +煬 +煮 +煽 +熈 +熊 +熙 +熟 +熨 +熱 +熹 +熾 +燃 +燈 +燎 +燔 +燕 +燗 +燥 +燭 +燻 +爆 +爐 +爪 +爬 +爲 +爵 +父 +爺 +爼 +爽 +爾 +片 +版 +牌 +牒 +牘 +牙 +牛 +牝 +牟 +牡 +牢 +牧 +物 +牲 +特 +牽 +犂 +犠 +犬 +犯 +状 +狂 +狄 +狐 +狗 +狙 +狛 +狡 +狩 +独 +狭 +狷 +狸 +狼 +猊 +猛 +猟 +猥 +猨 +猩 +猪 +猫 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獏 +獣 +獲 +玄 +玅 +率 +玉 +王 +玖 +玩 +玲 +珀 +珂 +珈 +珉 +珊 +珍 +珎 +珞 +珠 +珣 +珥 +珪 +班 +現 +球 +理 +琉 +琢 +琥 +琦 +琮 +琲 +琳 +琴 +琵 +琶 +瑁 +瑋 +瑙 +瑚 +瑛 +瑜 +瑞 +瑠 +瑤 +瑩 +瑪 +瑳 +瑾 +璃 +璋 +璜 +璞 +璧 +璨 +環 +璵 +璽 +璿 +瓊 +瓔 +瓜 +瓢 +瓦 +瓶 +甍 +甑 +甕 +甘 +甚 +甞 +生 +産 +甥 +用 +甫 +田 +由 +甲 +申 +男 +町 +画 +界 +畏 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +番 +異 +畳 +當 +畷 +畸 +畺 +畿 +疆 +疇 +疋 +疎 +疏 +疑 +疫 +疱 +疲 +疹 +疼 +疾 +病 +症 +痒 +痔 +痕 +痘 +痙 +痛 +痢 +痩 +痴 +痺 +瘍 +瘡 +瘧 +療 +癇 +癌 +癒 +癖 +癡 +癪 +発 +登 +白 +百 +的 +皆 +皇 +皋 +皐 +皓 +皮 +皺 +皿 +盂 +盃 +盆 +盈 +益 +盒 +盗 +盛 +盞 +盟 +盡 +監 +盤 +盥 +盧 +目 +盲 +直 +相 +盾 +省 +眉 +看 +県 +眞 +真 +眠 +眷 +眺 +眼 +着 +睡 +督 +睦 +睨 +睿 +瞋 +瞑 +瞞 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矍 +矛 +矜 +矢 +知 +矧 +矩 +短 +矮 +矯 +石 +砂 +砌 +研 +砕 +砥 +砦 +砧 +砲 +破 +砺 +硝 +硫 +硬 +硯 +碁 +碇 +碌 +碑 +碓 +碕 +碗 +碣 +碧 +碩 +確 +碾 +磁 +磐 +磔 +磧 +磨 +磬 +磯 +礁 +礎 +礒 +礙 +礫 +礬 +示 +礼 +社 +祀 +祁 +祇 +祈 +祉 +祐 +祓 +祕 +祖 +祗 +祚 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祷 +祺 +禁 +禄 +禅 +禊 +禍 +禎 +福 +禔 +禖 +禛 +禦 +禧 +禮 +禰 +禹 +禽 +禿 +秀 +私 +秋 +科 +秒 +秘 +租 +秤 +秦 +秩 +称 +移 +稀 +程 +税 +稔 +稗 +稙 +稚 +稜 +稠 +種 +稱 +稲 +稷 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +積 +穎 +穏 +穗 +穜 +穢 +穣 +穫 +穴 +究 +空 +突 +窃 +窄 +窒 +窓 +窟 +窠 +窩 +窪 +窮 +窯 +竃 +竄 +竈 +立 +站 +竜 +竝 +竟 +章 +童 +竪 +竭 +端 +竴 +競 +竹 +竺 +竽 +竿 +笄 +笈 +笏 +笑 +笙 +笛 +笞 +笠 +笥 +符 +第 +笹 +筅 +筆 +筇 +筈 +等 +筋 +筌 +筍 +筏 +筐 +筑 +筒 +答 +策 +筝 +筥 +筧 +筬 +筮 +筯 +筰 +筵 +箆 +箇 +箋 +箏 +箒 +箔 +箕 +算 +箙 +箜 +管 +箪 +箭 +箱 +箸 +節 +篁 +範 +篆 +篇 +築 +篋 +篌 +篝 +篠 +篤 +篥 +篦 +篩 +篭 +篳 +篷 +簀 +簒 +簡 +簧 +簪 +簫 +簺 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +米 +籾 +粂 +粉 +粋 +粒 +粕 +粗 +粘 +粛 +粟 +粥 +粧 +粮 +粳 +精 +糊 +糖 +糜 +糞 +糟 +糠 +糧 +糯 +糸 +糺 +系 +糾 +紀 +約 +紅 +紋 +納 +紐 +純 +紗 +紘 +紙 +級 +紛 +素 +紡 +索 +紫 +紬 +累 +細 +紳 +紵 +紹 +紺 +絁 +終 +絃 +組 +絅 +経 +結 +絖 +絞 +絡 +絣 +給 +統 +絲 +絵 +絶 +絹 +絽 +綏 +經 +継 +続 +綜 +綟 +綬 +維 +綱 +網 +綴 +綸 +綺 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +線 +締 +緥 +編 +緩 +緬 +緯 +練 +緻 +縁 +縄 +縅 +縒 +縛 +縞 +縢 +縣 +縦 +縫 +縮 +縹 +總 +績 +繁 +繊 +繋 +繍 +織 +繕 +繝 +繦 +繧 +繰 +繹 +繼 +纂 +纈 +纏 +纐 +纒 +纛 +缶 +罔 +罠 +罧 +罪 +置 +罰 +署 +罵 +罷 +罹 +羂 +羅 +羆 +羇 +羈 +羊 +羌 +美 +群 +羨 +義 +羯 +羲 +羹 +羽 +翁 +翅 +翌 +習 +翔 +翛 +翠 +翡 +翫 +翰 +翺 +翻 +翼 +耀 +老 +考 +者 +耆 +而 +耐 +耕 +耗 +耨 +耳 +耶 +耽 +聊 +聖 +聘 +聚 +聞 +聟 +聡 +聨 +聯 +聰 +聲 +聴 +職 +聾 +肄 +肆 +肇 +肉 +肋 +肌 +肖 +肘 +肛 +肝 +股 +肢 +肥 +肩 +肪 +肯 +肱 +育 +肴 +肺 +胃 +胆 +背 +胎 +胖 +胚 +胝 +胞 +胡 +胤 +胱 +胴 +胸 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脚 +脛 +脩 +脱 +脳 +腋 +腎 +腐 +腑 +腔 +腕 +腫 +腰 +腱 +腸 +腹 +腺 +腿 +膀 +膏 +膚 +膜 +膝 +膠 +膣 +膨 +膩 +膳 +膵 +膾 +膿 +臂 
+臆 +臈 +臍 +臓 +臘 +臚 +臣 +臥 +臨 +自 +臭 +至 +致 +臺 +臼 +舂 +舅 +與 +興 +舌 +舍 +舎 +舒 +舖 +舗 +舘 +舜 +舞 +舟 +舩 +航 +般 +舳 +舶 +船 +艇 +艘 +艦 +艮 +良 +色 +艶 +芋 +芒 +芙 +芝 +芥 +芦 +芬 +芭 +芯 +花 +芳 +芸 +芹 +芻 +芽 +芿 +苅 +苑 +苔 +苗 +苛 +苞 +苡 +若 +苦 +苧 +苫 +英 +苴 +苻 +茂 +范 +茄 +茅 +茎 +茗 +茘 +茜 +茨 +茲 +茵 +茶 +茸 +茹 +草 +荊 +荏 +荒 +荘 +荷 +荻 +荼 +莞 +莪 +莫 +莬 +莱 +莵 +莽 +菅 +菊 +菌 +菓 +菖 +菘 +菜 +菟 +菩 +菫 +華 +菱 +菴 +萄 +萊 +萌 +萍 +萎 +萠 +萩 +萬 +萱 +落 +葉 +著 +葛 +葡 +董 +葦 +葩 +葬 +葭 +葱 +葵 +葺 +蒋 +蒐 +蒔 +蒙 +蒟 +蒡 +蒲 +蒸 +蒻 +蒼 +蒿 +蓄 +蓆 +蓉 +蓋 +蓑 +蓬 +蓮 +蓼 +蔀 +蔑 +蔓 +蔚 +蔡 +蔦 +蔬 +蔭 +蔵 +蔽 +蕃 +蕉 +蕊 +蕎 +蕨 +蕩 +蕪 +蕭 +蕾 +薄 +薇 +薊 +薔 +薗 +薙 +薛 +薦 +薨 +薩 +薪 +薫 +薬 +薭 +薮 +藁 +藉 +藍 +藏 +藐 +藝 +藤 +藩 +藪 +藷 +藹 +藺 +藻 +蘂 +蘆 +蘇 +蘊 +蘭 +虎 +虐 +虔 +虚 +虜 +虞 +號 +虫 +虹 +虻 +蚊 +蚕 +蛇 +蛉 +蛍 +蛎 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛸 +蛹 +蛾 +蜀 +蜂 +蜃 +蜆 +蜊 +蜘 +蜜 +蜷 +蜻 +蝉 +蝋 +蝕 +蝙 +蝠 +蝦 +蝶 +蝿 +螂 +融 +螣 +螺 +蟄 +蟇 +蟠 +蟷 +蟹 +蟻 +蠢 +蠣 +血 +衆 +行 +衍 +衒 +術 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +表 +衫 +衰 +衵 +衷 +衽 +衾 +衿 +袁 +袈 +袋 +袍 +袒 +袖 +袙 +袞 +袢 +被 +袰 +袱 +袴 +袷 +袿 +裁 +裂 +裃 +装 +裏 +裔 +裕 +裘 +裙 +補 +裟 +裡 +裲 +裳 +裴 +裸 +裹 +製 +裾 +褂 +褄 +複 +褌 +褐 +褒 +褥 +褪 +褶 +褻 +襄 +襖 +襞 +襟 +襠 +襦 +襪 +襲 +襴 +襷 +西 +要 +覆 +覇 +覈 +見 +規 +視 +覗 +覚 +覧 +親 +覲 +観 +覺 +觀 +角 +解 +触 +言 +訂 +計 +討 +訓 +託 +記 +訛 +訟 +訢 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詐 +詔 +評 +詛 +詞 +詠 +詢 +詣 +試 +詩 +詫 +詮 +詰 +話 +該 +詳 +誄 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +説 +読 +誰 +課 +誼 +誾 +調 +談 +請 +諌 +諍 +諏 +諒 +論 +諚 +諜 +諟 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諶 +諷 +諸 +諺 +諾 +謀 +謄 +謌 +謎 +謗 +謙 +謚 +講 +謝 +謡 +謫 +謬 +謹 +證 +識 +譚 +譛 +譜 +警 +譬 +譯 +議 +譲 +譴 +護 +讀 +讃 +讐 +讒 +谷 +谿 +豅 +豆 +豊 +豎 +豐 +豚 +象 +豪 +豫 +豹 +貌 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貴 +買 +貸 +費 +貼 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賎 +賑 +賓 +賛 +賜 +賞 +賠 +賢 +賣 +賤 +賦 +質 +賭 +購 +賽 +贄 +贅 +贈 +贋 +贔 +贖 +赤 +赦 +走 +赴 +起 +超 +越 +趙 +趣 +足 +趺 +趾 +跋 +跏 +距 +跡 +跨 +跪 +路 +跳 +践 +踊 +踏 +踐 +踞 +踪 +踵 +蹄 +蹉 +蹊 +蹟 +蹲 +蹴 +躅 +躇 +躊 +躍 +躑 +躙 +躪 +身 +躬 +躯 +躰 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軻 +軽 +軾 +較 +載 +輌 +輔 +輜 +輝 +輦 +輩 +輪 +輯 +輸 +輿 +轄 +轍 +轟 +轢 +辛 +辞 +辟 +辥 +辦 +辨 +辰 +辱 +農 +辺 +辻 +込 +迂 +迅 +迎 +近 +返 +迢 +迦 +迪 +迫 +迭 +述 +迷 +迹 +追 +退 +送 +逃 +逅 +逆 +逍 +透 +逐 +逓 +途 +逕 +逗 +這 +通 +逝 +逞 +速 +造 +逢 +連 +逮 +週 +進 +逸 +逼 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遐 +道 +達 +違 +遙 +遜 +遠 +遡 +遣 +遥 +適 +遭 +遮 +遯 +遵 +遷 +選 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邉 +邊 +邑 +那 +邦 +邨 +邪 +邯 +邵 +邸 +郁 +郊 +郎 +郡 +郢 +部 +郭 +郴 +郵 +郷 +都 +鄂 +鄙 +鄭 +鄰 +鄲 +酉 +酋 +酌 +配 +酎 +酒 +酔 +酢 +酥 +酪 +酬 +酵 +酷 +酸 +醍 +醐 +醒 +醗 +醜 +醤 +醪 +醵 +醸 +采 +釈 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釘 +釜 +針 +釣 +釧 +釿 +鈍 +鈎 +鈐 +鈔 +鈞 +鈦 +鈴 +鈷 +鈸 +鈿 +鉄 +鉇 +鉉 +鉋 +鉛 +鉢 +鉤 +鉦 +鉱 +鉾 +銀 +銃 +銅 +銈 +銑 +銕 +銘 +銚 +銜 +銭 +鋏 +鋒 +鋤 +鋭 +鋲 +鋳 +鋸 +鋺 +鋼 +錆 +錍 +錐 +錘 +錠 +錣 +錦 +錫 +錬 +錯 +録 +錵 +鍋 +鍍 +鍑 +鍔 +鍛 +鍬 +鍮 +鍵 +鍼 +鍾 +鎌 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎰 +鎹 +鏃 +鏑 +鏡 +鐃 +鐇 +鐐 +鐔 +鐘 +鐙 +鐚 +鐡 +鐵 +鐸 +鑁 +鑊 +鑑 +鑒 +鑚 +鑠 +鑢 +鑰 +鑵 +鑷 +鑼 +鑽 +鑿 +長 +門 +閃 +閇 +閉 +開 +閏 +閑 +間 +閔 +閘 +関 +閣 +閤 +閥 +閦 +閨 +閬 +閲 +閻 +閼 +閾 +闇 +闍 +闔 +闕 +闘 +關 +闡 +闢 +闥 +阜 +阪 +阮 +阯 +防 +阻 +阿 +陀 +陂 +附 +陌 +降 +限 +陛 +陞 +院 +陣 +除 +陥 +陪 +陬 +陰 +陳 +陵 +陶 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +階 +随 +隔 +際 +障 +隠 +隣 +隧 +隷 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雊 +雋 +雌 +雍 +雑 +雖 +雙 +雛 +離 +難 +雨 +雪 +雫 +雰 +雲 +零 +雷 +雹 +電 +需 +震 +霊 +霍 +霖 +霜 +霞 +霧 +霰 +露 +靈 +青 +靖 +静 +靜 +非 +面 +革 +靫 +靭 +靱 +靴 +靺 +鞁 +鞄 +鞆 +鞋 +鞍 +鞏 +鞘 +鞠 +鞨 +鞭 +韋 +韓 +韜 +韮 +音 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +領 +頚 +頬 +頭 +頴 +頸 +頻 +頼 +顆 +題 +額 +顎 +顔 +顕 +顗 +願 +顛 +類 +顧 +顯 +風 +飛 +食 +飢 +飩 +飫 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餓 +餘 +餝 +餡 +館 +饂 +饅 +饉 +饋 +饌 +饒 +饗 +首 +馗 +香 +馨 +馬 +馳 +馴 +駄 +駅 +駆 +駈 +駐 +駒 +駕 +駝 +駿 +騁 +騎 +騏 +騒 +験 +騙 +騨 +騰 +驕 +驚 +驛 +驢 +骨 +骸 +髄 +體 +高 +髙 +髢 +髪 +髭 +髮 +髷 +髻 +鬘 +鬚 +鬢 +鬨 +鬯 +鬱 +鬼 +魁 +魂 +魄 +魅 +魏 +魔 +魚 +魯 +鮎 +鮑 +鮒 +鮪 +鮫 +鮭 +鮮 +鯉 +鯔 +鯖 +鯛 +鯨 +鯰 +鯱 +鰐 +鰒 +鰭 +鰯 +鰰 +鰹 +鰻 +鱈 +鱒 +鱗 +鱧 +鳥 +鳩 +鳰 +鳳 +鳴 +鳶 +鴈 +鴉 +鴎 +鴛 +鴟 +鴦 +鴨 +鴫 +鴻 +鵄 +鵜 +鵞 +鵡 +鵬 +鵲 +鵺 +鶉 +鶏 +鶯 +鶴 +鷄 +鷙 +鷲 +鷹 +鷺 +鸚 +鸞 +鹸 +鹽 +鹿 +麁 +麒 +麓 +麗 +麝 +麞 +麟 +麦 +麩 +麹 +麺 +麻 +麾 +麿 +黄 +黌 +黍 +黒 +黙 +黛 +黠 +鼈 
+鼉 +鼎 +鼓 +鼠 +鼻 +齊 +齋 +齟 +齢 +齬 +龍 +龕 +龗 +! +# +% +& +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +R +S +T +U +V +W +X +Z +a +c +d +e +f +h +i +j +k +l +m +n +o +p +r +s +t +u +y +z +~ +・ + diff --git a/ppocr/utils/dict/ka_dict.txt b/ppocr/utils/dict/ka_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d506b691bd1a6c55299ad89a72cf3a69a2c879a9 --- /dev/null +++ b/ppocr/utils/dict/ka_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. +j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? ++ + diff --git a/ppocr/utils/dict/kie_dict/xfund_class_list.txt b/ppocr/utils/dict/kie_dict/xfund_class_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..faded9f9b8f56bd258909bec9b8f1755aa688367 --- /dev/null +++ b/ppocr/utils/dict/kie_dict/xfund_class_list.txt @@ -0,0 +1,4 @@ +OTHER +QUESTION +ANSWER +HEADER diff --git a/ppocr/utils/dict/korean_dict.txt b/ppocr/utils/dict/korean_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..a13899f14dfe3bfc25b34904390c7b1e4ed8674b --- /dev/null +++ b/ppocr/utils/dict/korean_dict.txt @@ -0,0 +1,3688 @@ +! +" +# +$ +% +& +' +* ++ +- +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +© +° +² +½ +Á +Ä +Å +Ç +É +Í +Î +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ě +ğ +ī +İ +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ź +ż +Ž +ž +Ș +ș +Α +Δ +α +λ +φ +Г +О +а +в +л +о +р +с +т +я +​ +’ +“ +” +→ +∇ +∼ +「 +」 +ア +カ +グ +ニ +ラ +ン +ㄱ +ㄴ +ㄷ +ㄸ +ㄹ +ㅂ +ㅅ +ㅆ +ㅇ +ㅈ +ㅊ +ㅋ +ㅌ +ㅎ +ㅓ +ㅜ +ㅣ +一 +丁 +七 +三 +上 +下 +不 +丑 +世 +丘 +丞 +中 +丸 +丹 +主 +乃 +久 +之 +乎 +乘 +九 +也 +乳 +乾 +事 +二 +云 +互 +五 +井 +亞 +亡 +交 +亥 +亨 +享 +京 +亭 +人 +仁 +今 +他 +仙 +代 +令 +以 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +伯 +伴 +伸 +佃 +佈 +位 +低 +住 +佐 +何 +佛 +作 +使 +來 +供 +依 +侯 +侵 +侶 +便 +俗 +保 +俠 +信 +修 +俱 +俳 +倉 +個 +倍 +倒 +候 +借 +値 +倫 +倭 +假 +偈 +偉 +偏 +停 +偶 +傅 +傑 +傳 +傷 +傾 +像 +僞 +僥 +僧 +價 +儀 +儉 +儒 +優 +儼 +兀 +允 +元 +兆 +先 +光 +克 +兒 +入 +內 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +再 +冠 +冥 +冶 +准 +凞 +凡 +凱 +出 +函 +刀 +分 +刊 +刑 +列 +初 +判 +別 +利 +到 +制 +券 +刺 +刻 +則 +前 +剛 +副 +創 +劃 +劑 +力 +功 +加 +劣 +助 +劫 +勇 +動 +務 +勝 +勢 +勳 +勸 +匈 +化 +北 +匠 +區 +十 +千 +午 +半 +卍 +卑 +卒 +卓 +南 +博 +卜 +占 +卦 +印 +危 +卵 +卷 +卽 +卿 +厄 +原 +厦 +去 +參 +又 +叉 +友 +反 +叔 +受 +口 +古 +句 +可 +台 +史 +右 +司 +各 +合 +吉 +同 +名 +后 +吏 +吐 +君 +吠 +吳 +呂 +告 +周 +味 +呵 +命 +和 +咳 +咸 +咽 +哀 +品 +哨 +哮 +哲 +唐 +唯 +唱 +商 +問 +啼 +善 +喆 +喉 +喜 +喩 +喪 +嘗 +器 +嚴 +囊 +四 +回 +因 +困 +固 +圈 +國 +圍 +園 +圓 +圖 +團 +土 +在 +地 +均 +坊 +坐 +坑 +坵 +型 +垢 +城 +域 +埴 +執 +培 +基 +堂 +堅 +堆 +堤 +堯 +報 +場 +塔 +塚 +塞 +塵 +境 +墜 +墟 +墨 +墳 +墾 +壁 +壇 +壓 +壤 +士 +壬 +壯 +壺 +壽 +夏 +夕 +外 +多 +夜 +夢 +大 +天 +太 +夫 +央 +失 +夷 +奄 +奇 +奉 +奎 +奏 +契 +奔 +奮 +女 +奴 +好 +如 +妄 +妊 +妖 +妙 +始 +姑 +姓 +姚 +姜 +威 +婆 +婚 +婦 +媒 +媚 +子 +孔 +字 +存 +孝 +孟 +季 +孤 +孫 +學 +孺 +宇 +守 +安 +宋 +宗 +官 +宙 +定 +客 +宣 +室 +宮 +害 +家 +容 +寂 +寃 +寄 +寅 +密 +寇 +富 +寒 +寓 +實 +審 +寫 +寬 +寶 +寸 +寺 +封 +將 +專 +尊 +對 +小 +少 +尙 +尹 +尼 +尿 +局 +居 +屈 +屋 +屍 +屎 +屛 +層 +屬 +山 +岐 +岡 +岩 +岳 +岸 +峙 +峰 +島 +峻 +峽 +崇 +崔 +崖 +崩 +嶋 +巖 +川 +州 +巢 +工 +左 +巧 +巨 +巫 +差 +己 +巷 +市 +布 +帝 +師 
+帶 +常 +帽 +幕 +干 +平 +年 +幹 +幻 +幼 +幽 +庇 +序 +店 +府 +度 +座 +庫 +庭 +康 +廟 +廣 +廳 +延 +廷 +建 +廻 +弁 +式 +弑 +弓 +引 +弘 +弟 +弱 +張 +强 +弼 +彌 +彛 +形 +彬 +影 +役 +彼 +彿 +往 +征 +待 +律 +後 +徐 +徑 +得 +從 +循 +微 +德 +徹 +心 +必 +忌 +忍 +志 +忠 +思 +怡 +急 +性 +恐 +恒 +恨 +恩 +悅 +悖 +患 +悲 +情 +惑 +惟 +惠 +惡 +想 +惺 +愁 +意 +愚 +愛 +感 +愼 +慈 +態 +慕 +慣 +慧 +慾 +憂 +憤 +憺 +應 +懸 +戎 +成 +我 +戟 +戮 +戰 +戴 +戶 +房 +所 +手 +才 +打 +批 +承 +技 +抄 +把 +抗 +抱 +抽 +拇 +拓 +拘 +拙 +拜 +拾 +持 +指 +捌 +捨 +捿 +授 +掌 +排 +接 +推 +提 +揚 +揭 +援 +損 +搗 +摩 +播 +操 +擒 +擔 +擘 +據 +擧 +攘 +攝 +攬 +支 +改 +攻 +放 +政 +故 +敍 +敎 +救 +敗 +散 +敬 +整 +數 +文 +斗 +料 +斛 +斜 +斧 +斯 +新 +斷 +方 +於 +施 +旋 +族 +旗 +日 +旨 +早 +旱 +昌 +明 +易 +昔 +星 +春 +昧 +昭 +是 +時 +晉 +晋 +晩 +普 +景 +晴 +晶 +智 +暈 +暑 +暗 +暘 +曉 +曜 +曠 +曦 +曰 +曲 +書 +曹 +曼 +曾 +最 +會 +月 +有 +朋 +服 +望 +朝 +期 +木 +未 +末 +本 +朱 +朴 +李 +材 +村 +杖 +杜 +杞 +杭 +杯 +東 +松 +板 +林 +果 +枝 +枯 +枰 +枾 +柏 +柑 +柱 +栗 +校 +栢 +核 +根 +格 +桀 +桂 +案 +桎 +桑 +桓 +桔 +梁 +梏 +梓 +梗 +條 +梨 +梵 +棗 +棟 +森 +植 +椒 +楊 +楓 +楚 +業 +楮 +極 +榮 +槃 +槍 +樂 +樓 +樗 +樣 +樸 +樹 +樺 +樽 +橄 +橋 +橘 +機 +橡 +檀 +檎 +權 +欌 +欖 +次 +欲 +歌 +歐 +止 +正 +此 +步 +武 +歲 +歸 +死 +殖 +段 +殷 +殺 +殿 +毅 +母 +毒 +比 +毛 +氏 +民 +氣 +水 +永 +求 +汎 +汗 +江 +池 +沅 +沒 +沖 +沙 +沛 +河 +油 +治 +沼 +沿 +泉 +泊 +法 +泗 +泡 +波 +注 +泰 +洋 +洙 +洛 +洞 +津 +洲 +活 +派 +流 +浅 +浦 +浮 +浴 +海 +涅 +涇 +消 +涌 +液 +淑 +淡 +淨 +淫 +深 +淳 +淵 +淸 +渠 +渡 +游 +渾 +湖 +湯 +源 +溪 +溫 +溶 +滄 +滅 +滋 +滯 +滿 +漁 +漆 +漢 +漫 +漸 +潑 +潤 +潭 +澄 +澎 +澤 +澳 +澹 +濁 +濕 +濟 +濤 +濯 +瀋 +瀝 +灣 +火 +灰 +灸 +災 +炎 +炭 +点 +烈 +烏 +烙 +焚 +無 +焦 +然 +煌 +煎 +照 +煬 +煮 +熟 +熱 +燁 +燈 +燔 +燕 +燥 +燧 +燮 +爲 +爵 +父 +片 +版 +牌 +牛 +牝 +牟 +牡 +物 +特 +犧 +犬 +狀 +狗 +猥 +猩 +猪 +獨 +獵 +獸 +獻 +玄 +玉 +王 +玲 +珍 +珠 +珪 +班 +現 +球 +理 +琴 +瑞 +瑟 +瑪 +璃 +璋 +璽 +瓜 +瓦 +甑 +甘 +生 +産 +用 +甫 +田 +由 +甲 +申 +男 +界 +畏 +留 +畜 +畢 +略 +番 +異 +畵 +當 +畸 +疏 +疑 +疫 +疹 +疼 +病 +症 +痔 +痛 +痺 +瘀 +瘍 +瘡 +療 +癌 +癖 +登 +發 +白 +百 +的 +皆 +皇 +皮 +盂 +盆 +益 +盛 +盜 +盟 +盡 +盤 +盧 +目 +直 +相 +省 +看 +眞 +眼 +睡 +督 +瞋 +矢 +矣 +知 +短 +石 +破 +碍 +碑 +磁 +磨 +磬 +示 +社 +祇 +祖 +祝 +神 +祥 +祭 +祺 +禁 +禅 +禍 +福 +禦 +禪 +禮 +禹 +禽 +禾 +秀 +私 +秉 +秋 +科 +秘 +秤 +秦 +秩 +移 +稀 +稗 +種 +稱 +稷 +稼 +稽 +穀 +穆 +積 +空 +窮 +竅 +立 +章 +童 +竭 +端 +竹 +笑 +符 +第 +筆 +等 +筍 +答 +策 +箋 +箕 +管 +箱 +節 +篇 +簡 +米 +粉 +粘 +粥 +精 +糖 +糞 +系 +紀 +紂 +約 +紅 +紋 +純 +紙 +級 +素 +索 +紫 +紬 +累 +細 +紳 +終 +組 +結 +絡 +統 +絲 +絶 +絹 +經 +綠 +維 +綱 +網 +綸 +綽 +緖 +線 +緣 +緯 +縣 +縱 +總 +織 +繡 +繩 +繪 +繭 +纂 +續 +罕 +置 +罰 +羅 +羊 +美 +群 +義 +羽 +翁 +習 +翟 +老 +考 +者 +而 +耐 +耕 +耳 +聃 +聖 +聞 +聰 +聲 +職 +肇 +肉 +肖 +肝 +股 +肥 +育 +肺 +胃 +胎 +胚 +胞 +胡 +胥 +能 +脂 +脈 +脚 +脛 +脣 +脩 +脫 +脯 +脾 +腋 +腎 +腫 +腸 +腹 +膜 +膠 +膨 +膽 +臆 +臟 +臣 +臥 +臨 +自 +至 +致 +臺 +臼 +臾 +與 +興 +舊 +舌 +舍 +舒 +舜 +舟 +般 +船 +艦 +良 +色 +芋 +花 +芳 +芽 +苑 +苔 +苕 +苛 +苞 +若 +苦 +英 +茂 +茵 +茶 +茹 +荀 +荇 +草 +荒 +荷 +莊 +莫 +菊 +菌 +菜 +菩 +菫 +華 +菴 +菽 +萊 +萍 +萬 +落 +葉 +著 +葛 +董 +葬 +蒙 +蒜 +蒲 +蒸 +蒿 +蓮 +蔓 +蔘 +蔡 +蔬 +蕃 +蕉 +蕓 +薄 +薑 +薛 +薩 +薪 +薺 +藏 +藝 +藤 +藥 +藩 +藻 +蘆 +蘇 +蘊 +蘚 +蘭 +虎 +處 +虛 +虞 +虹 +蜀 +蜂 +蜜 +蝕 +蝶 +融 +蟬 +蟲 +蠶 +蠻 +血 +衆 +行 +術 +衛 +衡 +衣 +表 +袁 +裔 +裕 +裙 +補 +製 +複 +襄 +西 +要 +見 +視 +親 +覺 +觀 +角 +解 +言 +訂 +訊 +訓 +託 +記 +訣 +設 +診 +註 +評 +詩 +話 +詵 +誅 +誌 +認 +誕 +語 +誠 +誤 +誥 +誦 +說 +調 +談 +諍 +論 +諡 +諫 +諭 +諸 +謙 +講 +謝 +謠 +證 +識 +譚 +譜 +譯 +議 +護 +讀 +變 +谷 +豆 +豊 +豚 +象 +豪 +豫 +貝 +貞 +財 +貧 +貨 +貪 +貫 +貴 +貸 +費 +資 +賊 +賓 +賞 +賢 +賣 +賦 +質 +贍 +赤 +赫 +走 +起 +超 +越 +趙 +趣 +趨 +足 +趾 +跋 +跡 +路 +踏 +蹟 +身 +躬 +車 +軍 +軒 +軟 +載 +輓 +輕 +輪 +輯 +輸 +輻 +輿 +轅 +轉 +辨 +辭 +辯 +辰 +農 +近 +迦 +述 +追 +逆 +透 +逐 +通 +逝 +造 +逢 +連 +進 +逵 +遂 +遊 +運 +遍 +過 +道 +達 +遠 +遡 +適 +遷 +選 +遺 +遽 +還 +邊 +邑 +那 +邪 +郞 +郡 +部 +都 +鄒 +鄕 +鄭 +鄲 +配 +酒 +酸 +醉 +醫 +醯 +釋 +里 +重 +野 +量 +釐 +金 +針 +鈍 +鈴 +鉞 +銀 +銅 +銘 +鋼 +錄 +錢 +錦 +鎭 +鏡 +鐘 +鐵 +鑑 +鑛 +長 +門 +閃 +開 +間 +閔 +閣 +閥 +閭 +閻 +闕 +關 +阪 +防 +阿 +陀 +降 +限 +陝 +院 +陰 +陳 +陵 +陶 +陸 +陽 +隆 +隊 +隋 +階 +際 +障 +隣 +隨 +隱 +隷 +雀 +雄 +雅 +集 +雇 +雌 +雖 +雙 +雜 +離 +難 +雨 +雪 +雲 +電 +霜 +露 +靈 +靑 +靖 +靜 +非 +面 +革 +靴 +鞏 +韓 +音 +韶 +韻 +順 +須 +頊 +頌 +領 +頭 +顔 +願 +顚 +類 +顯 +風 +飛 +食 +飢 +飮 +飯 +飾 +養 +餓 +餘 +首 +香 +馨 +馬 +駒 +騫 +騷 +驕 +骨 +骸 +髓 +體 +高 +髥 +髮 +鬪 +鬱 +鬼 +魏 +魔 +魚 +魯 +鮮 +鰍 +鰐 +鳥 +鳧 +鳳 +鴨 +鵲 +鶴 +鷄 +鷹 +鹽 +鹿 +麗 +麥 +麻 +黃 +黑 +默 +點 +黨 +鼎 +齊 
+齋 +齒 +龍 +龜 +가 +각 +간 +갇 +갈 +갉 +감 +갑 +값 +갓 +갔 +강 +갖 +갗 +같 +갚 +갛 +개 +객 +갠 +갤 +갬 +갭 +갯 +갰 +갱 +갸 +걀 +걔 +걘 +거 +걱 +건 +걷 +걸 +검 +겁 +것 +겄 +겅 +겆 +겉 +겊 +겋 +게 +겐 +겔 +겟 +겠 +겡 +겨 +격 +겪 +견 +결 +겸 +겹 +겻 +겼 +경 +곁 +계 +곕 +곗 +고 +곡 +곤 +곧 +골 +곪 +곬 +곯 +곰 +곱 +곳 +공 +곶 +과 +곽 +관 +괄 +괌 +광 +괘 +괜 +괭 +괴 +괸 +굉 +교 +구 +국 +군 +굳 +굴 +굵 +굶 +굼 +굽 +굿 +궁 +궂 +궈 +권 +궐 +궜 +궝 +궤 +귀 +귄 +귈 +귓 +규 +균 +귤 +그 +극 +근 +글 +긁 +금 +급 +긋 +긍 +기 +긴 +길 +김 +깁 +깃 +깅 +깊 +까 +깍 +깎 +깐 +깔 +깜 +깝 +깟 +깡 +깥 +깨 +깬 +깰 +깻 +깼 +깽 +꺄 +꺼 +꺽 +꺾 +껀 +껄 +껌 +껍 +껏 +껐 +껑 +께 +껴 +꼈 +꼍 +꼐 +꼬 +꼭 +꼴 +꼼 +꼽 +꼿 +꽁 +꽂 +꽃 +꽉 +꽝 +꽤 +꽥 +꾀 +꾜 +꾸 +꾹 +꾼 +꿀 +꿇 +꿈 +꿉 +꿋 +꿍 +꿎 +꿔 +꿨 +꿩 +꿰 +꿴 +뀄 +뀌 +뀐 +뀔 +뀜 +뀝 +끄 +끈 +끊 +끌 +끓 +끔 +끕 +끗 +끙 +끝 +끼 +끽 +낀 +낄 +낌 +낍 +낏 +낑 +나 +낙 +낚 +난 +낟 +날 +낡 +남 +납 +낫 +났 +낭 +낮 +낯 +낱 +낳 +내 +낵 +낸 +낼 +냄 +냅 +냇 +냈 +냉 +냐 +냔 +냘 +냥 +너 +넉 +넋 +넌 +널 +넓 +넘 +넙 +넛 +넜 +넝 +넣 +네 +넥 +넨 +넬 +넴 +넵 +넷 +넸 +넹 +녀 +녁 +년 +념 +녔 +녕 +녘 +녜 +노 +녹 +논 +놀 +놈 +놋 +농 +높 +놓 +놔 +놨 +뇌 +뇨 +뇩 +뇽 +누 +눅 +눈 +눌 +눔 +눕 +눗 +눠 +눴 +뉘 +뉜 +뉩 +뉴 +늄 +늅 +늉 +느 +늑 +는 +늘 +늙 +늠 +늡 +능 +늦 +늪 +늬 +니 +닉 +닌 +닐 +님 +닙 +닛 +닝 +닢 +다 +닥 +닦 +단 +닫 +달 +닭 +닮 +닯 +닳 +담 +답 +닷 +당 +닻 +닿 +대 +댁 +댄 +댈 +댐 +댑 +댓 +댔 +댕 +댜 +더 +덕 +덖 +던 +덜 +덟 +덤 +덥 +덧 +덩 +덫 +덮 +데 +덱 +덴 +델 +뎀 +뎃 +뎅 +뎌 +뎠 +뎨 +도 +독 +돈 +돋 +돌 +돔 +돕 +돗 +동 +돛 +돝 +돼 +됐 +되 +된 +될 +됨 +됩 +됴 +두 +둑 +둔 +둘 +둠 +둡 +둣 +둥 +둬 +뒀 +뒤 +뒬 +뒷 +뒹 +듀 +듈 +듐 +드 +득 +든 +듣 +들 +듦 +듬 +듭 +듯 +등 +듸 +디 +딕 +딘 +딛 +딜 +딤 +딥 +딧 +딨 +딩 +딪 +따 +딱 +딴 +딸 +땀 +땄 +땅 +때 +땐 +땔 +땜 +땝 +땠 +땡 +떠 +떡 +떤 +떨 +떫 +떰 +떱 +떳 +떴 +떵 +떻 +떼 +떽 +뗀 +뗄 +뗍 +뗏 +뗐 +뗑 +또 +똑 +똘 +똥 +뙤 +뚜 +뚝 +뚤 +뚫 +뚱 +뛰 +뛴 +뛸 +뜀 +뜁 +뜨 +뜩 +뜬 +뜯 +뜰 +뜸 +뜻 +띄 +띈 +띌 +띔 +띕 +띠 +띤 +띨 +띱 +띵 +라 +락 +란 +랄 +람 +랍 +랏 +랐 +랑 +랒 +랗 +래 +랙 +랜 +랠 +램 +랩 +랫 +랬 +랭 +랴 +략 +량 +러 +럭 +런 +럴 +럼 +럽 +럿 +렀 +렁 +렇 +레 +렉 +렌 +렐 +렘 +렙 +렛 +렝 +려 +력 +련 +렬 +렴 +렵 +렷 +렸 +령 +례 +로 +록 +론 +롤 +롬 +롭 +롯 +롱 +롸 +롹 +뢰 +뢴 +뢸 +룃 +료 +룐 +룡 +루 +룩 +룬 +룰 +룸 +룹 +룻 +룽 +뤄 +뤘 +뤼 +류 +륙 +륜 +률 +륨 +륭 +르 +륵 +른 +를 +름 +릅 +릇 +릉 +릎 +리 +릭 +린 +릴 +림 +립 +릿 +링 +마 +막 +만 +많 +맏 +말 +맑 +맘 +맙 +맛 +망 +맞 +맡 +맣 +매 +맥 +맨 +맬 +맴 +맵 +맷 +맸 +맹 +맺 +먀 +먁 +머 +먹 +먼 +멀 +멈 +멋 +멍 +멎 +메 +멕 +멘 +멜 +멤 +멥 +멧 +멩 +며 +멱 +면 +멸 +몄 +명 +몇 +모 +목 +몫 +몬 +몰 +몸 +몹 +못 +몽 +뫼 +묘 +무 +묵 +묶 +문 +묻 +물 +묽 +뭄 +뭅 +뭇 +뭉 +뭍 +뭏 +뭐 +뭔 +뭘 +뭡 +뭣 +뮈 +뮌 +뮐 +뮤 +뮬 +므 +믈 +믐 +미 +믹 +민 +믿 +밀 +밈 +밉 +밋 +밌 +밍 +및 +밑 +바 +박 +밖 +반 +받 +발 +밝 +밟 +밤 +밥 +밧 +방 +밭 +배 +백 +밴 +밸 +뱀 +뱁 +뱃 +뱄 +뱅 +뱉 +뱍 +뱐 +버 +벅 +번 +벌 +범 +법 +벗 +벙 +벚 +베 +벡 +벤 +벨 +벰 +벱 +벳 +벵 +벼 +벽 +변 +별 +볍 +볏 +볐 +병 +볕 +보 +복 +볶 +본 +볼 +봄 +봅 +봇 +봉 +봐 +봤 +뵈 +뵐 +뵙 +부 +북 +분 +붇 +불 +붉 +붐 +붓 +붕 +붙 +뷔 +뷰 +뷴 +뷸 +브 +븐 +블 +비 +빅 +빈 +빌 +빔 +빕 +빗 +빙 +빚 +빛 +빠 +빡 +빤 +빨 +빳 +빴 +빵 +빻 +빼 +빽 +뺀 +뺄 +뺌 +뺏 +뺐 +뺑 +뺨 +뻐 +뻑 +뻔 +뻗 +뻘 +뻣 +뻤 +뻥 +뻬 +뼈 +뼉 +뼘 +뽀 +뽈 +뽐 +뽑 +뽕 +뾰 +뿌 +뿍 +뿐 +뿔 +뿜 +쁘 +쁜 +쁠 +쁨 +삐 +삔 +삘 +사 +삭 +삯 +산 +살 +삵 +삶 +삼 +삽 +삿 +샀 +상 +샅 +새 +색 +샌 +샐 +샘 +샙 +샛 +샜 +생 +샤 +샨 +샬 +샴 +샵 +샷 +샹 +서 +석 +섞 +선 +섣 +설 +섬 +섭 +섯 +섰 +성 +섶 +세 +섹 +센 +셀 +셈 +셉 +셋 +셌 +셍 +셔 +션 +셜 +셨 +셰 +셴 +셸 +소 +속 +손 +솔 +솜 +솝 +솟 +송 +솥 +쇄 +쇠 +쇤 +쇳 +쇼 +숀 +숄 +숍 +수 +숙 +순 +숟 +술 +숨 +숩 +숫 +숭 +숯 +숱 +숲 +숴 +쉐 +쉘 +쉬 +쉭 +쉰 +쉴 +쉼 +쉽 +슈 +슐 +슘 +슛 +슝 +스 +슥 +슨 +슬 +슭 +슴 +습 +슷 +승 +시 +식 +신 +싣 +실 +싫 +심 +십 +싯 +싱 +싶 +싸 +싹 +싼 +쌀 +쌈 +쌉 +쌌 +쌍 +쌓 +쌔 +쌘 +쌩 +써 +썩 +썬 +썰 +썸 +썹 +썼 +썽 +쎄 +쎈 +쏘 +쏙 +쏜 +쏟 +쏠 +쏭 +쏴 +쐈 +쐐 +쐬 +쑤 +쑥 +쑨 +쒀 +쒔 +쓰 +쓱 +쓴 +쓸 +씀 +씁 +씌 +씨 +씩 +씬 +씰 +씸 +씹 +씻 +씽 +아 +악 +안 +앉 +않 +알 +앎 +앓 +암 +압 +앗 +았 +앙 +앞 +애 +액 +앤 +앨 +앰 +앱 +앳 +앴 +앵 +야 +약 +얀 +얄 +얇 +얌 +얍 +얏 +양 +얕 +얗 +얘 +얜 +어 +억 +언 +얹 +얻 +얼 +얽 +엄 +업 +없 +엇 +었 +엉 +엊 +엌 +엎 +에 +엑 +엔 +엘 +엠 +엡 +엣 +엥 +여 +역 +엮 +연 +열 +엷 +염 +엽 +엾 +엿 +였 +영 +옅 +옆 +옇 +예 +옌 +옐 +옙 +옛 +오 +옥 +온 +올 +옭 +옮 +옳 +옴 +옵 +옷 +옹 +옻 +와 +왁 +완 +왈 +왑 +왓 +왔 +왕 +왜 +왠 +왱 +외 +왼 +요 +욕 +욘 +욜 +욤 +용 +우 +욱 +운 +울 +움 +웁 +웃 +웅 +워 +웍 +원 +월 +웜 +웠 +웡 +웨 +웬 +웰 +웸 +웹 +위 +윅 +윈 +윌 +윔 +윗 +윙 +유 +육 +윤 +율 +윱 +윳 +융 +으 +윽 +은 +을 +읊 +음 +읍 +응 +의 +읜 +읠 +이 +익 +인 +일 +읽 +잃 +임 +입 +잇 +있 +잉 +잊 +잎 +자 +작 +잔 
+잖 +잘 +잠 +잡 +잣 +잤 +장 +잦 +재 +잭 +잰 +잴 +잽 +잿 +쟀 +쟁 +쟈 +쟉 +쟤 +저 +적 +전 +절 +젊 +점 +접 +젓 +정 +젖 +제 +젝 +젠 +젤 +젬 +젭 +젯 +져 +젼 +졀 +졌 +졍 +조 +족 +존 +졸 +좀 +좁 +종 +좇 +좋 +좌 +좍 +좽 +죄 +죠 +죤 +주 +죽 +준 +줄 +줌 +줍 +줏 +중 +줘 +줬 +쥐 +쥔 +쥘 +쥬 +쥴 +즈 +즉 +즌 +즐 +즘 +즙 +증 +지 +직 +진 +짇 +질 +짊 +짐 +집 +짓 +징 +짖 +짙 +짚 +짜 +짝 +짠 +짢 +짤 +짧 +짬 +짭 +짰 +짱 +째 +짹 +짼 +쨀 +쨉 +쨋 +쨌 +쨍 +쩄 +쩌 +쩍 +쩐 +쩔 +쩜 +쩝 +쩡 +쩨 +쪄 +쪘 +쪼 +쪽 +쪾 +쫀 +쫄 +쫑 +쫓 +쫙 +쬐 +쭈 +쭉 +쭐 +쭙 +쯔 +쯤 +쯧 +찌 +찍 +찐 +찔 +찜 +찝 +찡 +찢 +찧 +차 +착 +찬 +찮 +찰 +참 +찹 +찻 +찼 +창 +찾 +채 +책 +챈 +챌 +챔 +챕 +챗 +챘 +챙 +챠 +챤 +처 +척 +천 +철 +첨 +첩 +첫 +청 +체 +첵 +첸 +첼 +쳄 +쳇 +쳉 +쳐 +쳔 +쳤 +초 +촉 +촌 +촘 +촛 +총 +촨 +촬 +최 +쵸 +추 +축 +춘 +출 +춤 +춥 +춧 +충 +춰 +췄 +췌 +취 +췬 +츄 +츠 +측 +츨 +츰 +층 +치 +칙 +친 +칠 +칡 +침 +칩 +칫 +칭 +카 +칵 +칸 +칼 +캄 +캅 +캇 +캉 +캐 +캔 +캘 +캠 +캡 +캣 +캤 +캥 +캬 +커 +컥 +컨 +컫 +컬 +컴 +컵 +컷 +컸 +컹 +케 +켄 +켈 +켐 +켓 +켕 +켜 +켠 +켤 +켭 +켯 +켰 +코 +콕 +콘 +콜 +콤 +콥 +콧 +콩 +콰 +콱 +콴 +콸 +쾅 +쾌 +쾡 +쾨 +쾰 +쿄 +쿠 +쿡 +쿤 +쿨 +쿰 +쿵 +쿼 +퀀 +퀄 +퀘 +퀭 +퀴 +퀵 +퀸 +퀼 +큐 +큘 +크 +큰 +클 +큼 +큽 +키 +킥 +킨 +킬 +킴 +킵 +킷 +킹 +타 +탁 +탄 +탈 +탉 +탐 +탑 +탓 +탔 +탕 +태 +택 +탠 +탤 +탬 +탭 +탯 +탰 +탱 +터 +턱 +턴 +털 +텀 +텁 +텃 +텄 +텅 +테 +텍 +텐 +텔 +템 +텝 +텡 +텨 +톈 +토 +톡 +톤 +톨 +톰 +톱 +톳 +통 +퇴 +툇 +투 +툭 +툰 +툴 +툼 +퉁 +퉈 +퉜 +튀 +튄 +튈 +튕 +튜 +튠 +튤 +튬 +트 +특 +튼 +튿 +틀 +틈 +틉 +틋 +틔 +티 +틱 +틴 +틸 +팀 +팁 +팅 +파 +팍 +팎 +판 +팔 +팜 +팝 +팟 +팠 +팡 +팥 +패 +팩 +팬 +팰 +팸 +팻 +팼 +팽 +퍼 +퍽 +펀 +펄 +펌 +펍 +펐 +펑 +페 +펙 +펜 +펠 +펨 +펩 +펫 +펭 +펴 +편 +펼 +폄 +폈 +평 +폐 +포 +폭 +폰 +폴 +폼 +폿 +퐁 +표 +푭 +푸 +푹 +푼 +풀 +품 +풋 +풍 +퓨 +퓬 +퓰 +퓸 +프 +픈 +플 +픔 +픕 +피 +픽 +핀 +필 +핌 +핍 +핏 +핑 +하 +학 +한 +할 +핥 +함 +합 +핫 +항 +해 +핵 +핸 +핼 +햄 +햅 +햇 +했 +행 +햐 +향 +헀 +허 +헉 +헌 +헐 +험 +헙 +헛 +헝 +헤 +헥 +헨 +헬 +헴 +헵 +헷 +헹 +혀 +혁 +현 +혈 +혐 +협 +혓 +혔 +형 +혜 +호 +혹 +혼 +홀 +홈 +홉 +홋 +홍 +홑 +화 +확 +환 +활 +홧 +황 +홰 +홱 +횃 +회 +획 +횝 +횟 +횡 +효 +후 +훅 +훈 +훌 +훑 +훔 +훗 +훤 +훨 +훼 +휄 +휑 +휘 +휙 +휜 +휠 +휩 +휭 +휴 +휼 +흄 +흉 +흐 +흑 +흔 +흘 +흙 +흠 +흡 +흣 +흥 +흩 +희 +흰 +흽 +히 +힉 +힌 +힐 +힘 +힙 +힝 +車 +滑 +金 +奈 +羅 +洛 +卵 +欄 +蘭 +郎 +來 +盧 +老 +魯 +綠 +鹿 +論 +雷 +樓 +縷 +凌 +樂 +不 +參 +葉 +沈 +若 +兩 +凉 +梁 +呂 +女 +廬 +麗 +黎 +曆 +歷 +戀 +蓮 +連 +列 +烈 +裂 +念 +獵 +靈 +領 +例 +禮 +醴 +惡 +尿 +料 +遼 +龍 +暈 +柳 +流 +類 +六 +陸 +倫 +律 +栗 +利 +李 +梨 +理 +離 +燐 +林 +臨 +立 +茶 +切 +宅 + diff --git a/ppocr/utils/dict/latex_symbol_dict.txt b/ppocr/utils/dict/latex_symbol_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d17f2c2cf79cf50fa83c17e087bb77a79a862b39 --- /dev/null +++ b/ppocr/utils/dict/latex_symbol_dict.txt @@ -0,0 +1,111 @@ +eos +sos +! +' +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +< += +> +A +B +C +E +F +G +H +I +L +M +N +P +R +S +T +V +X +Y +[ +\Delta +\alpha +\beta +\cdot +\cdots +\cos +\div +\exists +\forall +\frac +\gamma +\geq +\in +\infty +\int +\lambda +\ldots +\leq +\lim +\log +\mu +\neq +\phi +\pi +\pm +\prime +\rightarrow +\sigma +\sin +\sqrt +\sum +\tan +\theta +\times +] +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +\{ +| +\} +{ +} +^ +_ \ No newline at end of file diff --git a/ppocr/utils/dict/latin_dict.txt b/ppocr/utils/dict/latin_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e166bf33ecfbdc90ddb3d9743fded23306acabd5 --- /dev/null +++ b/ppocr/utils/dict/latin_dict.txt @@ -0,0 +1,185 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +} +¡ +£ +§ +ª +« +­ +° +² +³ +´ +µ +· +º +» +¿ +À +Á + +Ä +Å +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ò +Ó +Ô +Õ +Ö +Ú +Ü +Ý +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +ą +Ć +ć +Č +č +Đ +đ +ę +ı +Ł +ł +ō +Œ +œ +Š +š +Ÿ +Ž +ž +ʒ +β +δ +ε +з +Ṡ +‘ +€ +™ diff --git a/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt b/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..8be0f48600a88463d840fffe27eebd63629576ce --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_cdla_dict.txt @@ -0,0 +1,10 @@ +text +title +figure +figure_caption +table +table_caption +header +footer +reference +equation \ No newline at end of file diff --git a/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt b/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca6acf4eef8d4d5f9ba5a4ced4858a119a4ef983 --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt @@ -0,0 +1,5 @@ +text +title +list +table +figure \ No newline at end of file diff --git a/ppocr/utils/dict/layout_dict/layout_table_dict.txt b/ppocr/utils/dict/layout_dict/layout_table_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..faea15ea07d7d1a6f77dbd4287bb9fa87165cbb9 --- /dev/null +++ b/ppocr/utils/dict/layout_dict/layout_table_dict.txt @@ -0,0 +1 @@ +table \ No newline at end of file diff --git a/ppocr/utils/dict/mr_dict.txt b/ppocr/utils/dict/mr_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..283b1504ae344ed7db95050ddb9e3682126cc741 --- /dev/null +++ b/ppocr/utils/dict/mr_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/ppocr/utils/dict/ne_dict.txt b/ppocr/utils/dict/ne_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7df9537fc886c4de53ee68ab67b171b386780f --- /dev/null +++ b/ppocr/utils/dict/ne_dict.txt @@ -0,0 +1,153 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +े +ै +ो +ौ +् +॒ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ diff --git a/ppocr/utils/dict/oc_dict.txt b/ppocr/utils/dict/oc_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e88af8bd85f4e01c43d08e5ba3ae6cadc5a465a4 --- /dev/null +++ b/ppocr/utils/dict/oc_dict.txt @@ -0,0 +1,96 @@ +o +c +_ +i +m +g +/ +2 +0 +I +L +S +V +R +C +1 +v +a +l +4 +3 +. 
+j +p +r +e +è +t +9 +7 +5 +8 +n +' +b +s +6 +q +u +á +d +ò +à +h +z +f +ï +í +A +ç +x +ó +é +P +O +Ò +ü +k +À +F +- +ú +­ +æ +Á +D +E +w +K +T +N +y +U +Z +G +B +J +H +M +W +Y +X +Q +% +$ +, +@ +& +! +: +( +# +? ++ +É + diff --git a/ppocr/utils/dict/pu_dict.txt b/ppocr/utils/dict/pu_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..9500fae6e4976ea632bf579b533f82f176f3b7e7 --- /dev/null +++ b/ppocr/utils/dict/pu_dict.txt @@ -0,0 +1,130 @@ +p +u +_ +i +m +g +/ +8 +I +L +S +V +R +C +2 +0 +1 +v +a +l +6 +7 +4 +5 +. +j + +q +e +s +t +ã +o +x +9 +c +n +r +z +ç +õ +3 +A +U +d +º +ô +­ +, +E +; +ó +á +b +D +? +ú +ê +- +h +P +f +à +N +í +O +M +G +É +é +â +F +: +T +Á +" +Q +) +W +J +B +H +( +ö +% +Ö +« +w +K +y +! +k +] +' +Z ++ +Ç +Õ +Y +À +X +µ +» +ª +Í +ü +ä +´ +è +ñ +ß +ï +Ú +ë +Ô +Ï +Ó +[ +Ì +< + +ò +§ +³ +ø +å +# +$ +& +@ diff --git a/ppocr/utils/dict/rs_dict.txt b/ppocr/utils/dict/rs_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1ce46d240841b8471cbb1209ee92864895c667c --- /dev/null +++ b/ppocr/utils/dict/rs_dict.txt @@ -0,0 +1,91 @@ +r +s +_ +i +m +g +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +7 +5 +8 +6 +. +j +p + +t +d +9 +3 +e +š +4 +k +u +ć +c +n +đ +o +z +č +b +ž +f +Z +T +h +M +F +O +Š +B +H +A +E +Đ +Ž +D +P +G +Č +K +U +N +J +Ć +w +y +W +x +Y +X +q +Q +# +& +$ +, +- +% +' +@ +! +: +? +( +É +é ++ diff --git a/ppocr/utils/dict/rsc_dict.txt b/ppocr/utils/dict/rsc_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..95dd4636057e5b6dd8bd3a3dd6aacf19e790cffb --- /dev/null +++ b/ppocr/utils/dict/rsc_dict.txt @@ -0,0 +1,134 @@ +r +s +c +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p +м +а +с +и +р +ћ +е +ш +3 +4 +о +г +н +з +в +л +6 +т +ж +у +к +п +њ +д +ч +С +ј +ф +ц +љ +х +О +И +А +б +Ш +К +ђ +џ +М +В +З +Д +Р +У +Н +Т +Б +? +П +Х +Ј +Ц +Г +Љ +Л +Ф +e +n +w +E +F +A +N +f +o +b +M +G +t +y +W +k +P +u +H +B +T +z +h +O +Y +d +U +K +D +x +X +J +Z +Q +q +' +- +@ +é +# +! +, +% +$ +: +& ++ +( +É + diff --git a/ppocr/utils/dict/ru_dict.txt b/ppocr/utils/dict/ru_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b0cf3a8d6cd61ae395d1242dae3d42906029e2c --- /dev/null +++ b/ppocr/utils/dict/ru_dict.txt @@ -0,0 +1,125 @@ +к +в +а +з +и +у +р +о +н +я +х +п +л +ы +г +е +т +м +д +ж +ш +ь +с +ё +б +й +ч +ю +ц +щ +М +э +ф +А +ъ +С +Ф +Ю +В +К +Т +Н +О +Э +У +И +Г +Л +Р +Д +Б +Ш +П +З +Х +Е +Ж +Я +Ц +Ч +Й +Щ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + diff --git a/ppocr/utils/dict/spin_dict.txt b/ppocr/utils/dict/spin_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ee8347fd9c85228a3cf46c810d4fc28ab05c492 --- /dev/null +++ b/ppocr/utils/dict/spin_dict.txt @@ -0,0 +1,68 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +: +( +' +- +, +% +> +. +[ +? +) +" += +_ +* +] +; +& ++ +$ +@ +/ +| +! +< +# +` +{ +~ +\ +} +^ \ No newline at end of file diff --git a/ppocr/utils/dict/ta_dict.txt b/ppocr/utils/dict/ta_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..19d81892c205627f296adbf8b20ea41aba2de5d0 --- /dev/null +++ b/ppocr/utils/dict/ta_dict.txt @@ -0,0 +1,128 @@ +t +a +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +l +9 +7 +8 +. 
+j +p +ப +ூ +த +ம +ி +வ +ர +் +ந +ோ +ன +6 +ஆ +ற +ல +5 +ள +ா +ொ +ழ +ு +4 +ெ +ண +க +ட +ை +ே +ச +ய +ஒ +இ +அ +ங +உ +ீ +ஞ +எ +ஓ +ஃ +ஜ +ஷ +ஸ +ஏ +ஊ +ஹ +ஈ +ஐ +ௌ +ஔ +s +c +e +n +w +F +T +O +P +K +A +N +G +Y +E +M +H +U +B +o +b +D +d +r +W +u +y +f +X +k +q +h +J +z +Z +Q +x +- +' +$ +, +% +@ +é +! +# ++ +É +& +: +( +? + diff --git a/ppocr/utils/dict/table_dict.txt b/ppocr/utils/dict/table_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ef028c786cbce6d1e25856c62986d757b31f93b --- /dev/null +++ b/ppocr/utils/dict/table_dict.txt @@ -0,0 +1,277 @@ +← + +☆ +─ +α + + +⋅ +$ +ω +ψ +χ +( +υ +≥ +σ +, +ρ +ε +0 +■ +4 +8 +✗ +b +< +✓ +Ψ +Ω +€ +D +3 +Π +H +║ + +L +Φ +Χ +θ +P +κ +λ +μ +T +ξ +X +β +γ +δ +\ +ζ +η +` +d + +h +f +l +Θ +p +√ +t + +x +Β +Γ +Δ +| +ǂ +ɛ +j +̧ +➢ +⁡ +̌ +′ +« +△ +▲ +# + +' +Ι ++ +¶ +/ +▼ +⇑ +□ +· +7 +▪ +; +? +➔ +∩ +C +÷ +G +⇒ +K + +O +S +С +W +Α +[ +○ +_ +● +‡ +c +z +g + +o + +〈 +〉 +s +⩽ +w +φ +ʹ +{ +» +∣ +̆ +e +ˆ +∈ +τ +◆ +ι +∅ +∆ +∙ +∘ +Ø +ß +✔ +∞ +∑ +− +× +◊ +∗ +∖ +˃ +˂ +∫ +" +i +& +π +↔ +* +∥ +æ +∧ +. +⁄ +ø +Q +∼ +6 +⁎ +: +★ +> +a +B +≈ +F +J +̄ +N +♯ +R +V + +― +Z +♣ +^ +¤ +¥ +§ + +¢ +£ +≦ +­ +≤ +‖ +Λ +© +n +↓ +→ +↑ +r +° +± +v + +♂ +k +♀ +~ +ᅟ +̇ +@ +” +♦ +ł +® +⊕ +„ +! + +% +⇓ +) +- +1 +5 +9 += +А +A +‰ +⋆ +Σ +E +◦ +I +※ +M +m +̨ +⩾ +† + +• +U +Y +
 +] +̸ +2 +‐ +– +‒ +̂ +— +̀ +́ +’ +‘ +⋮ +⋯ +̊ +“ +̈ +≧ +q +u +ı +y + +​ +̃ +} +ν diff --git a/ppocr/utils/dict/table_master_structure_dict.txt b/ppocr/utils/dict/table_master_structure_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..95ab2539a70aca4f695c53a38cdc1c3e164fcfb3 --- /dev/null +++ b/ppocr/utils/dict/table_master_structure_dict.txt @@ -0,0 +1,39 @@ + + + + + + + + + + + colspan="2" + colspan="3" + + + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" + + + + + + + + diff --git a/ppocr/utils/dict/table_structure_dict.txt b/ppocr/utils/dict/table_structure_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..8edb10b8817ad596af6c63b6b8fc5eb2349b7464 --- /dev/null +++ b/ppocr/utils/dict/table_structure_dict.txt @@ -0,0 +1,28 @@ + + + + + + + + + + colspan="2" + colspan="3" + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" \ No newline at end of file diff --git a/ppocr/utils/dict/table_structure_dict_ch.txt b/ppocr/utils/dict/table_structure_dict_ch.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c59c0e9998a31f9d32f703625aa1c5ca7718c8d --- /dev/null +++ b/ppocr/utils/dict/table_structure_dict_ch.txt @@ -0,0 +1,48 @@ + + + + + + + + + + colspan="2" + colspan="3" + colspan="4" + colspan="5" + colspan="6" + colspan="7" + colspan="8" + colspan="9" + colspan="10" + colspan="11" + colspan="12" + colspan="13" + colspan="14" + colspan="15" + colspan="16" + colspan="17" + colspan="18" + colspan="19" + colspan="20" + rowspan="2" + rowspan="3" + rowspan="4" + rowspan="5" + rowspan="6" + rowspan="7" + rowspan="8" + rowspan="9" + rowspan="10" + rowspan="11" + rowspan="12" + rowspan="13" + rowspan="14" + rowspan="15" + rowspan="16" + rowspan="17" + rowspan="18" + rowspan="19" + rowspan="20" diff --git a/ppocr/utils/dict/te_dict.txt b/ppocr/utils/dict/te_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..83d74cc7e5f899ca43b23fa690d84d70bee535e3 --- /dev/null +++ b/ppocr/utils/dict/te_dict.txt @@ -0,0 +1,151 @@ +t +e +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +3 +4 +8 +9 +. +j +p +త +ె +ర +క +్ +ి +ం +చ +ే +ద +ు +7 +6 +ఉ +ా +మ +ట +ో +వ +ప +ల +శ +ఆ +య +ై +భ +' +ీ +గ +ూ +డ +ధ +హ +న +జ +స +[ +‌ +ష +అ +ణ +ఫ +బ +ఎ +; +ళ +థ +ొ +ఠ +ృ +ఒ +ఇ +ః +ఊ +ఖ +- +ఐ +ఘ +ౌ +ఏ +ఈ +ఛ +, +ఓ +ఞ +| +? +: +ఢ +" +( +” +! ++ +) +* += +& +“ +€ +] +£ +$ +s +c +n +w +k +J +G +u +d +r +E +o +h +y +b +f +B +M +O +T +N +D +P +A +F +x +W +Y +U +H +K +X +z +Z +Q +q +É +% +# +@ +é diff --git a/ppocr/utils/dict/ug_dict.txt b/ppocr/utils/dict/ug_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..77602f2cfd29d739478bc9e757bd82b71235554b --- /dev/null +++ b/ppocr/utils/dict/ug_dict.txt @@ -0,0 +1,114 @@ +u +g +_ +i +m +/ +1 +I +L +S +V +R +C +2 +0 +v +a +l +8 +5 +3 +6 +9 +. +j +p + +ق +ا +پ +ل +4 +7 +ئ +ى +ش +ت +ي +ك +د +ف +ر +و +ن +ب +ە +خ +ې +چ +ۇ +ز +س +م +ۋ +گ +ڭ +ۆ +ۈ +ج +غ +ھ +ژ +s +c +e +n +w +P +E +D +U +d +r +b +y +B +o +O +Y +N +T +k +t +h +A +H +F +z +W +K +G +M +f +Z +X +Q +J +x +q +- +! +% +# +? 
+: +$ +, +& +' +É +@ +é +( ++ diff --git a/ppocr/utils/dict/uk_dict.txt b/ppocr/utils/dict/uk_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5ffc0a53dbdf7af6d911097dffb8733d7d4eab1 --- /dev/null +++ b/ppocr/utils/dict/uk_dict.txt @@ -0,0 +1,142 @@ +u +k +_ +i +m +g +/ +1 +6 +I +L +S +V +R +C +2 +0 +v +a +l +7 +9 +. +j +p +в +і +д +п +о +н +с +т +ю +4 +5 +3 +а +и +м +е +р +ч +у +Б +з +л +к +8 +А +В +г +є +б +ь +х +ґ +ш +ц +ф +я +щ +ж +Г +Х +У +Т +Е +І +Н +П +З +Л +Ю +С +Д +М +К +Р +Ф +О +Ц +И +Я +Ч +Ш +Ж +Є +Ґ +Ь +s +c +e +n +w +A +P +r +E +t +o +h +d +y +M +G +N +F +B +T +D +U +O +W +Z +f +H +Y +b +K +z +x +Q +X +q +J +$ +- +' +# +& +% +? +: +! +, ++ +@ +( +é +É + diff --git a/ppocr/utils/dict/ur_dict.txt b/ppocr/utils/dict/ur_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..c06786a83bc60039fa395d71e367bece1e80b11d --- /dev/null +++ b/ppocr/utils/dict/ur_dict.txt @@ -0,0 +1,137 @@ +u +r +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +a +l +9 +7 +8 +. +j +p + +چ +ٹ +پ +ا +ئ +ی +ے +4 +6 +و +ل +ن +ڈ +ھ +ک +ت +ش +ف +ق +ر +د +5 +ب +ج +خ +ہ +س +ز +غ +ڑ +ں +آ +م +ؤ +ط +ص +ح +ع +گ +ث +ض +ذ +ۓ +ِ +ء +ظ +ً +ي +ُ +ۃ +أ +ٰ +ە +ژ +ۂ +ة +ّ +ك +ه +s +c +e +n +w +o +d +t +D +M +T +U +E +b +P +h +y +W +H +A +x +B +O +N +G +Y +Q +F +k +K +q +J +Z +f +z +X +' +@ +& +! +, +: +$ +- +# +? +% +é ++ +( +É diff --git a/ppocr/utils/dict/xi_dict.txt b/ppocr/utils/dict/xi_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..f195f1ea6ce90b09f5feecfc6c288eed423eeb8d --- /dev/null +++ b/ppocr/utils/dict/xi_dict.txt @@ -0,0 +1,110 @@ +x +i +_ +m +g +/ +1 +0 +I +L +S +V +R +C +2 +v +a +l +3 +6 +4 +5 +. +j +p + +Q +u +e +r +o +8 +7 +n +c +9 +t +b +é +q +d +ó +y +F +s +, +O +í +T +f +" +U +M +h +: +P +H +A +E +D +z +N +á +ñ +ú +% +; +è ++ +Y +- +B +G +( +) +¿ +? +w +¡ +! +X +É +K +k +Á +ü +Ú +« +» +J +' +ö +W +Z +º +Ö +­ +[ +] +Ç +ç +à +ä +û +ò +Í +ê +ô +ø +ª diff --git a/ppocr/utils/dict90.txt b/ppocr/utils/dict90.txt new file mode 100644 index 0000000000000000000000000000000000000000..a945ae9c526e4faa68852eb3fb47d078a2f3f6ce --- /dev/null +++ b/ppocr/utils/dict90.txt @@ -0,0 +1,90 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +_ +` +~ \ No newline at end of file diff --git a/ppocr/utils/e2e_metric/Deteval.py b/ppocr/utils/e2e_metric/Deteval.py new file mode 100644 index 0000000000000000000000000000000000000000..6ce56eda2aa9f38fdc712d49ae64945c558b418d --- /dev/null +++ b/ppocr/utils/e2e_metric/Deteval.py @@ -0,0 +1,701 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
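+# Module summary (descriptive comment, derived from the code below): these helpers build
+# per-image sigma/tau tables for end-to-end DetEval-style scoring, where
+# sigma = intersection_area / gt_area and tau = intersection_area / det_area for each
+# (ground truth, detection) pair; combine_results() then applies one_to_one / one_to_many
+# matching with thresholds tr=0.7 and tp=0.6 and, when rec_flag is set, counts exact
+# (case-insensitive fallback) transcription hits for the recognition score.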
+ +import json +import numpy as np +import scipy.io as io +import Polygon as plg +from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area + + +def get_socre_A(gt_dir, pred_dict): + allInputs = 1 + + def input_reading_mod(pred_dict): + """This helper reads input from txt files""" + det = [] + n = len(pred_dict) + for i in range(n): + points = pred_dict[i]['points'] + text = pred_dict[i]['texts'] + point = ",".join(map(str, points.reshape(-1, ))) + det.append([point, text]) + return det + + def gt_reading_mod(gt_dict): + """This helper reads groundtruths from mat files""" + gt = [] + n = len(gt_dict) + for i in range(n): + points = gt_dict[i]['points'].tolist() + h = len(points) + text = gt_dict[i]['text'] + xx = [ + np.array( + ['x:'], dtype=' 1): + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + det_x = detection[0::2] + det_y = detection[1::2] + det_gt_iou = iod(det_x, det_y, gt_x, gt_y) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_x, det_y, gt_x, gt_y): + """ + sigma = inter_area / gt_area + """ + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(gt_x, gt_y)), 2) + + def tau_calculation(det_x, det_y, gt_x, gt_y): + if area(det_x, det_y) == 0.0: + return 0 + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(det_x, det_y)), 2) + + ##############################Initialization################################### + # global_sigma = [] + # global_tau = [] + # global_pred_str = [] + # global_gt_str = [] + ############################################################################### + + for input_id in range(allInputs): + if (input_id != '.DS_Store') and (input_id != 'Pascal_result.txt') and ( + input_id != 'Pascal_result_curved.txt') and (input_id != 'Pascal_result_non_curved.txt') and ( + input_id != 'Deteval_result.txt') and (input_id != 'Deteval_result_curved.txt') \ + and (input_id != 'Deteval_result_non_curved.txt'): + detections = input_reading_mod(pred_dict) + groundtruths = gt_reading_mod(gt_dir) + detections = detection_filtering( + detections, + groundtruths) # filters detections overlapping with DC area + dc_id = [] + for i in range(len(groundtruths)): + if groundtruths[i][5] == '#': + dc_id.append(i) + cnt = 0 + for a in dc_id: + num = a - cnt + del groundtruths[num] + cnt += 1 + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + local_pred_str = {} + local_gt_str = {} + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + pred_seq_str = detection_orig[1].strip() + det_x = detection[0::2] + det_y = detection[1::2] + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + gt_seq_str = str(gt[4].tolist()[0]) + + local_sigma_table[gt_id, det_id] = sigma_calculation( + det_x, det_y, gt_x, gt_y) + local_tau_table[gt_id, det_id] = tau_calculation( + det_x, det_y, gt_x, gt_y) + local_pred_str[det_id] = pred_seq_str + local_gt_str[gt_id] = gt_seq_str + + global_sigma = 
local_sigma_table + global_tau = local_tau_table + global_pred_str = local_pred_str + global_gt_str = local_gt_str + + single_data = {} + single_data['sigma'] = global_sigma + single_data['global_tau'] = global_tau + single_data['global_pred_str'] = global_pred_str + single_data['global_gt_str'] = global_gt_str + return single_data + + +def get_socre_B(gt_dir, img_id, pred_dict): + allInputs = 1 + + def input_reading_mod(pred_dict): + """This helper reads input from txt files""" + det = [] + n = len(pred_dict) + for i in range(n): + points = pred_dict[i]['points'] + text = pred_dict[i]['texts'] + point = ",".join(map(str, points.reshape(-1, ))) + det.append([point, text]) + return det + + def gt_reading_mod(gt_dir, gt_id): + gt = io.loadmat('%s/poly_gt_img%s.mat' % (gt_dir, gt_id)) + gt = gt['polygt'] + return gt + + def detection_filtering(detections, groundtruths, threshold=0.5): + for gt_id, gt in enumerate(groundtruths): + if (gt[5] == '#') and (gt[1].shape[1] > 1): + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + det_x = detection[0::2] + det_y = detection[1::2] + det_gt_iou = iod(det_x, det_y, gt_x, gt_y) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_x, det_y, gt_x, gt_y): + """ + sigma = inter_area / gt_area + """ + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(gt_x, gt_y)), 2) + + def tau_calculation(det_x, det_y, gt_x, gt_y): + if area(det_x, det_y) == 0.0: + return 0 + return np.round((area_of_intersection(det_x, det_y, gt_x, gt_y) / + area(det_x, det_y)), 2) + + ##############################Initialization################################### + # global_sigma = [] + # global_tau = [] + # global_pred_str = [] + # global_gt_str = [] + ############################################################################### + + for input_id in range(allInputs): + if (input_id != '.DS_Store') and (input_id != 'Pascal_result.txt') and ( + input_id != 'Pascal_result_curved.txt') and (input_id != 'Pascal_result_non_curved.txt') and ( + input_id != 'Deteval_result.txt') and (input_id != 'Deteval_result_curved.txt') \ + and (input_id != 'Deteval_result_non_curved.txt'): + detections = input_reading_mod(pred_dict) + groundtruths = gt_reading_mod(gt_dir, img_id).tolist() + detections = detection_filtering( + detections, + groundtruths) # filters detections overlapping with DC area + dc_id = [] + for i in range(len(groundtruths)): + if groundtruths[i][5] == '#': + dc_id.append(i) + cnt = 0 + for a in dc_id: + num = a - cnt + del groundtruths[num] + cnt += 1 + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + local_pred_str = {} + local_gt_str = {} + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + detection_orig = detection + detection = [float(x) for x in detection[0].split(',')] + detection = list(map(int, detection)) + pred_seq_str = detection_orig[1].strip() + det_x = detection[0::2] + det_y = detection[1::2] + gt_x = list(map(int, np.squeeze(gt[1]))) + gt_y = list(map(int, np.squeeze(gt[3]))) + gt_seq_str = str(gt[4].tolist()[0]) + + local_sigma_table[gt_id, det_id] = 
sigma_calculation( + det_x, det_y, gt_x, gt_y) + local_tau_table[gt_id, det_id] = tau_calculation( + det_x, det_y, gt_x, gt_y) + local_pred_str[det_id] = pred_seq_str + local_gt_str[gt_id] = gt_seq_str + + global_sigma = local_sigma_table + global_tau = local_tau_table + global_pred_str = local_pred_str + global_gt_str = local_gt_str + + single_data = {} + single_data['sigma'] = global_sigma + single_data['global_tau'] = global_tau + single_data['global_pred_str'] = global_pred_str + single_data['global_gt_str'] = global_gt_str + return single_data + + +def get_score_C(gt_label, text, pred_bboxes): + """ + get score for CentripetalText (CT) prediction. + """ + + def gt_reading_mod(gt_label, text): + """This helper reads groundtruths from mat files""" + groundtruths = [] + nbox = len(gt_label) + for i in range(nbox): + label = {"transcription": text[i][0], "points": gt_label[i].numpy()} + groundtruths.append(label) + + return groundtruths + + def get_union(pD, pG): + areaA = pD.area() + areaB = pG.area() + return areaA + areaB - get_intersection(pD, pG) + + def get_intersection(pD, pG): + pInt = pD & pG + if len(pInt) == 0: + return 0 + return pInt.area() + + def detection_filtering(detections, groundtruths, threshold=0.5): + for gt in groundtruths: + point_num = gt['points'].shape[1] // 2 + if gt['transcription'] == '###' and (point_num > 1): + gt_p = np.array(gt['points']).reshape(point_num, + 2).astype('int32') + gt_p = plg.Polygon(gt_p) + + for det_id, detection in enumerate(detections): + det_y = detection[0::2] + det_x = detection[1::2] + + det_p = np.concatenate((np.array(det_x), np.array(det_y))) + det_p = det_p.reshape(2, -1).transpose() + det_p = plg.Polygon(det_p) + + try: + det_gt_iou = get_intersection(det_p, + gt_p) / det_p.area() + except: + print(det_x, det_y, gt_p) + if det_gt_iou > threshold: + detections[det_id] = [] + + detections[:] = [item for item in detections if item != []] + return detections + + def sigma_calculation(det_p, gt_p): + """ + sigma = inter_area / gt_area + """ + if gt_p.area() == 0.: + return 0 + return get_intersection(det_p, gt_p) / gt_p.area() + + def tau_calculation(det_p, gt_p): + """ + tau = inter_area / det_area + """ + if det_p.area() == 0.: + return 0 + return get_intersection(det_p, gt_p) / det_p.area() + + detections = [] + + for item in pred_bboxes: + detections.append(item[:, ::-1].reshape(-1)) + + groundtruths = gt_reading_mod(gt_label, text) + + detections = detection_filtering( + detections, groundtruths) # filters detections overlapping with DC area + + for idx in range(len(groundtruths) - 1, -1, -1): + #NOTE: source code use 'orin' to indicate '#', here we use 'anno', + # which may cause slight drop in fscore, about 0.12 + if groundtruths[idx]['transcription'] == '###': + groundtruths.pop(idx) + + local_sigma_table = np.zeros((len(groundtruths), len(detections))) + local_tau_table = np.zeros((len(groundtruths), len(detections))) + + for gt_id, gt in enumerate(groundtruths): + if len(detections) > 0: + for det_id, detection in enumerate(detections): + point_num = gt['points'].shape[1] // 2 + + gt_p = np.array(gt['points']).reshape(point_num, + 2).astype('int32') + gt_p = plg.Polygon(gt_p) + + det_y = detection[0::2] + det_x = detection[1::2] + + det_p = np.concatenate((np.array(det_x), np.array(det_y))) + + det_p = det_p.reshape(2, -1).transpose() + det_p = plg.Polygon(det_p) + + local_sigma_table[gt_id, det_id] = sigma_calculation(det_p, + gt_p) + local_tau_table[gt_id, det_id] = tau_calculation(det_p, gt_p) + + data = {} + 
data['sigma'] = local_sigma_table + data['global_tau'] = local_tau_table + data['global_pred_str'] = '' + data['global_gt_str'] = '' + return data + + +def combine_results(all_data, rec_flag=True): + tr = 0.7 + tp = 0.6 + fsc_k = 0.8 + k = 2 + global_sigma = [] + global_tau = [] + global_pred_str = [] + global_gt_str = [] + + for data in all_data: + global_sigma.append(data['sigma']) + global_tau.append(data['global_tau']) + global_pred_str.append(data['global_pred_str']) + global_gt_str.append(data['global_gt_str']) + + global_accumulative_recall = 0 + global_accumulative_precision = 0 + total_num_gt = 0 + total_num_det = 0 + hit_str_count = 0 + hit_count = 0 + + def one_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy, rec_flag): + hit_str_num = 0 + for gt_id in range(num_gt): + gt_matching_qualified_sigma_candidates = np.where( + local_sigma_table[gt_id, :] > tr) + gt_matching_num_qualified_sigma_candidates = gt_matching_qualified_sigma_candidates[ + 0].shape[0] + gt_matching_qualified_tau_candidates = np.where( + local_tau_table[gt_id, :] > tp) + gt_matching_num_qualified_tau_candidates = gt_matching_qualified_tau_candidates[ + 0].shape[0] + + det_matching_qualified_sigma_candidates = np.where( + local_sigma_table[:, gt_matching_qualified_sigma_candidates[0]] + > tr) + det_matching_num_qualified_sigma_candidates = det_matching_qualified_sigma_candidates[ + 0].shape[0] + det_matching_qualified_tau_candidates = np.where( + local_tau_table[:, gt_matching_qualified_tau_candidates[0]] > + tp) + det_matching_num_qualified_tau_candidates = det_matching_qualified_tau_candidates[ + 0].shape[0] + + if (gt_matching_num_qualified_sigma_candidates == 1) and (gt_matching_num_qualified_tau_candidates == 1) and \ + (det_matching_num_qualified_sigma_candidates == 1) and ( + det_matching_num_qualified_tau_candidates == 1): + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, gt_id] = 1 + matched_det_id = np.where(local_sigma_table[gt_id, :] > tr) + # recg start + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][matched_det_id[0] + .tolist()[0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + det_flag[0, matched_det_id] = 1 + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + def one_to_many(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy, rec_flag): + hit_str_num = 0 + for gt_id in range(num_gt): + # skip the following if the groundtruth was matched + if gt_flag[0, gt_id] > 0: + continue + + non_zero_in_sigma = np.where(local_sigma_table[gt_id, :] > 0) + num_non_zero_in_sigma = non_zero_in_sigma[0].shape[0] + + if num_non_zero_in_sigma >= k: + ####search for all detections that overlaps with this groundtruth + qualified_tau_candidates = np.where((local_tau_table[ + gt_id, :] >= tp) & (det_flag[0, :] == 0)) + num_qualified_tau_candidates = qualified_tau_candidates[ + 
0].shape[0] + + if num_qualified_tau_candidates == 1: + if ((local_tau_table[gt_id, qualified_tau_candidates] >= tp) + and + (local_sigma_table[gt_id, qualified_tau_candidates] >= + tr)): + # became an one-to-one case + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, gt_id] = 1 + det_flag[0, qualified_tau_candidates] = 1 + # recg start + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + elif (np.sum(local_sigma_table[gt_id, qualified_tau_candidates]) + >= tr): + gt_flag[0, gt_id] = 1 + det_flag[0, qualified_tau_candidates] = 1 + # recg start + if rec_flag: + gt_str_cur = global_gt_str[idy][gt_id] + pred_str_cur = global_pred_str[idy][ + qualified_tau_candidates[0].tolist()[0]] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + # recg end + + global_accumulative_recall = global_accumulative_recall + fsc_k + global_accumulative_precision = global_accumulative_precision + num_qualified_tau_candidates * fsc_k + + local_accumulative_recall = local_accumulative_recall + fsc_k + local_accumulative_precision = local_accumulative_precision + num_qualified_tau_candidates * fsc_k + + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + def many_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idy, rec_flag): + hit_str_num = 0 + for det_id in range(num_det): + # skip the following if the detection was matched + if det_flag[0, det_id] > 0: + continue + + non_zero_in_tau = np.where(local_tau_table[:, det_id] > 0) + num_non_zero_in_tau = non_zero_in_tau[0].shape[0] + + if num_non_zero_in_tau >= k: + ####search for all detections that overlaps with this groundtruth + qualified_sigma_candidates = np.where(( + local_sigma_table[:, det_id] >= tp) & (gt_flag[0, :] == 0)) + num_qualified_sigma_candidates = qualified_sigma_candidates[ + 0].shape[0] + + if num_qualified_sigma_candidates == 1: + if ((local_tau_table[qualified_sigma_candidates, det_id] >= + tp) and + (local_sigma_table[qualified_sigma_candidates, det_id] + >= tr)): + # became an one-to-one case + global_accumulative_recall = global_accumulative_recall + 1.0 + global_accumulative_precision = global_accumulative_precision + 1.0 + local_accumulative_recall = local_accumulative_recall + 1.0 + local_accumulative_precision = local_accumulative_precision + 1.0 + + gt_flag[0, qualified_sigma_candidates] = 1 + det_flag[0, det_id] = 1 + # recg start + if rec_flag: + pred_str_cur = global_pred_str[idy][det_id] + gt_len = len(qualified_sigma_candidates[0]) + for idx in range(gt_len): + ele_gt_id = qualified_sigma_candidates[ + 0].tolist()[idx] + if ele_gt_id not in global_gt_str[idy]: + continue + gt_str_cur = global_gt_str[idy][ele_gt_id] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + break + else: + if pred_str_cur.lower() == gt_str_cur.lower( + ): + hit_str_num += 1 + break + # recg end 
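+                # If no single candidate qualifies on its own, but the still-unmatched
+                # groundtruths selected above (sigma >= tp) jointly reach tp in
+                # accumulated tau for this detection, the branch below scores the
+                # group as a many-to-one match: recall is credited with
+                # num_qualified_sigma_candidates * fsc_k and precision with fsc_k (0.8),
+                # instead of a full 1.0 on each side.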
+ elif (np.sum(local_tau_table[qualified_sigma_candidates, + det_id]) >= tp): + det_flag[0, det_id] = 1 + gt_flag[0, qualified_sigma_candidates] = 1 + # recg start + if rec_flag: + pred_str_cur = global_pred_str[idy][det_id] + gt_len = len(qualified_sigma_candidates[0]) + for idx in range(gt_len): + ele_gt_id = qualified_sigma_candidates[0].tolist()[ + idx] + if ele_gt_id not in global_gt_str[idy]: + continue + gt_str_cur = global_gt_str[idy][ele_gt_id] + if pred_str_cur == gt_str_cur: + hit_str_num += 1 + break + else: + if pred_str_cur.lower() == gt_str_cur.lower(): + hit_str_num += 1 + break + # recg end + + global_accumulative_recall = global_accumulative_recall + num_qualified_sigma_candidates * fsc_k + global_accumulative_precision = global_accumulative_precision + fsc_k + + local_accumulative_recall = local_accumulative_recall + num_qualified_sigma_candidates * fsc_k + local_accumulative_precision = local_accumulative_precision + fsc_k + return local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, gt_flag, det_flag, hit_str_num + + for idx in range(len(global_sigma)): + local_sigma_table = np.array(global_sigma[idx]) + local_tau_table = global_tau[idx] + + num_gt = local_sigma_table.shape[0] + num_det = local_sigma_table.shape[1] + + total_num_gt = total_num_gt + num_gt + total_num_det = total_num_det + num_det + + local_accumulative_recall = 0 + local_accumulative_precision = 0 + gt_flag = np.zeros((1, num_gt)) + det_flag = np.zeros((1, num_det)) + + #######first check for one-to-one case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = one_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx, rec_flag) + + hit_str_count += hit_str_num + #######then check for one-to-many case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = one_to_many(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx, rec_flag) + hit_str_count += hit_str_num + #######then check for many-to-one case########## + local_accumulative_recall, local_accumulative_precision, global_accumulative_recall, global_accumulative_precision, \ + gt_flag, det_flag, hit_str_num = many_to_one(local_sigma_table, local_tau_table, + local_accumulative_recall, local_accumulative_precision, + global_accumulative_recall, global_accumulative_precision, + gt_flag, det_flag, idx, rec_flag) + hit_str_count += hit_str_num + + try: + recall = global_accumulative_recall / total_num_gt + except ZeroDivisionError: + recall = 0 + + try: + precision = global_accumulative_precision / total_num_det + except ZeroDivisionError: + precision = 0 + + try: + f_score = 2 * precision * recall / (precision + recall) + except ZeroDivisionError: + f_score = 0 + + try: + seqerr = 1 - float(hit_str_count) / global_accumulative_recall + except ZeroDivisionError: + seqerr = 1 + + try: + recall_e2e = float(hit_str_count) / total_num_gt + except ZeroDivisionError: + recall_e2e = 0 + + try: + precision_e2e = float(hit_str_count) / total_num_det + except ZeroDivisionError: + precision_e2e = 0 + + try: + f_score_e2e = 2 * 
precision_e2e * recall_e2e / ( + precision_e2e + recall_e2e) + except ZeroDivisionError: + f_score_e2e = 0 + + final = { + 'total_num_gt': total_num_gt, + 'total_num_det': total_num_det, + 'global_accumulative_recall': global_accumulative_recall, + 'hit_str_count': hit_str_count, + 'recall': recall, + 'precision': precision, + 'f_score': f_score, + 'seqerr': seqerr, + 'recall_e2e': recall_e2e, + 'precision_e2e': precision_e2e, + 'f_score_e2e': f_score_e2e + } + return final diff --git a/ppocr/utils/e2e_metric/polygon_fast.py b/ppocr/utils/e2e_metric/polygon_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..81c9ad70675bb37a95968283b6dc6f42f709df27 --- /dev/null +++ b/ppocr/utils/e2e_metric/polygon_fast.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from shapely.geometry import Polygon +""" +:param det_x: [1, N] Xs of detection's vertices +:param det_y: [1, N] Ys of detection's vertices +:param gt_x: [1, N] Xs of groundtruth's vertices +:param gt_y: [1, N] Ys of groundtruth's vertices + +############## +All the calculation of 'AREA' in this script is handled by: +1) First generating a binary mask with the polygon area filled up with 1's +2) Summing up all the 1's +""" + + +def area(x, y): + polygon = Polygon(np.stack([x, y], axis=1)) + return float(polygon.area) + + +def approx_area_of_intersection(det_x, det_y, gt_x, gt_y): + """ + This helper determine if both polygons are intersecting with each others with an approximation method. 
+ Area of intersection represented by the minimum bounding rectangular [xmin, ymin, xmax, ymax] + """ + det_ymax = np.max(det_y) + det_xmax = np.max(det_x) + det_ymin = np.min(det_y) + det_xmin = np.min(det_x) + + gt_ymax = np.max(gt_y) + gt_xmax = np.max(gt_x) + gt_ymin = np.min(gt_y) + gt_xmin = np.min(gt_x) + + all_min_ymax = np.minimum(det_ymax, gt_ymax) + all_max_ymin = np.maximum(det_ymin, gt_ymin) + + intersect_heights = np.maximum(0.0, (all_min_ymax - all_max_ymin)) + + all_min_xmax = np.minimum(det_xmax, gt_xmax) + all_max_xmin = np.maximum(det_xmin, gt_xmin) + intersect_widths = np.maximum(0.0, (all_min_xmax - all_max_xmin)) + + return intersect_heights * intersect_widths + + +def area_of_intersection(det_x, det_y, gt_x, gt_y): + p1 = Polygon(np.stack([det_x, det_y], axis=1)).buffer(0) + p2 = Polygon(np.stack([gt_x, gt_y], axis=1)).buffer(0) + return float(p1.intersection(p2).area) + + +def area_of_union(det_x, det_y, gt_x, gt_y): + p1 = Polygon(np.stack([det_x, det_y], axis=1)).buffer(0) + p2 = Polygon(np.stack([gt_x, gt_y], axis=1)).buffer(0) + return float(p1.union(p2).area) + + +def iou(det_x, det_y, gt_x, gt_y): + return area_of_intersection(det_x, det_y, gt_x, gt_y) / ( + area_of_union(det_x, det_y, gt_x, gt_y) + 1.0) + + +def iod(det_x, det_y, gt_x, gt_y): + """ + This helper determine the fraction of intersection area over detection area + """ + return area_of_intersection(det_x, det_y, gt_x, gt_y) / ( + area(det_x, det_y) + 1.0) diff --git a/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-37.pyc b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7de97052dc736a49fa5bf0da390380c198ab3fcf Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-37.pyc differ diff --git a/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-38.pyc b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ade00ed41d31b13d0cfd38b509453642b7c59c9 Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_fast.cpython-38.pyc differ diff --git a/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-37.pyc b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27665fcde6265997f8424042e65e233e33ebe9ee Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-37.pyc differ diff --git a/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-38.pyc b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f53d89bc58fe145ab0e5b614fb56fdf12e88ea0 Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/extract_textpoint_slow.cpython-38.pyc differ diff --git a/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-37.pyc b/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..144ae698fc4136f7e0e3dd03c4eadd6501e447ef Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-37.pyc differ diff --git a/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-38.pyc b/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..97d6d734b0411301d9c8397d024f349b464de740 Binary files /dev/null and b/ppocr/utils/e2e_utils/__pycache__/pgnet_pp_utils.cpython-38.pyc differ diff --git a/ppocr/utils/e2e_utils/extract_batchsize.py b/ppocr/utils/e2e_utils/extract_batchsize.py new file mode 100644 index 0000000000000000000000000000000000000000..e99a833ea76a81e02d39b16fe1a01e22f15bf3a4 --- /dev/null +++ b/ppocr/utils/e2e_utils/extract_batchsize.py @@ -0,0 +1,87 @@ +import paddle +import numpy as np +import copy + + +def org_tcl_rois(batch_size, pos_lists, pos_masks, label_lists, tcl_bs): + """ + """ + pos_lists_, pos_masks_, label_lists_ = [], [], [] + img_bs = batch_size + ngpu = int(batch_size / img_bs) + img_ids = np.array(pos_lists, dtype=np.int32)[:, 0, 0].copy() + pos_lists_split, pos_masks_split, label_lists_split = [], [], [] + for i in range(ngpu): + pos_lists_split.append([]) + pos_masks_split.append([]) + label_lists_split.append([]) + + for i in range(img_ids.shape[0]): + img_id = img_ids[i] + gpu_id = int(img_id / img_bs) + img_id = img_id % img_bs + pos_list = pos_lists[i].copy() + pos_list[:, 0] = img_id + pos_lists_split[gpu_id].append(pos_list) + pos_masks_split[gpu_id].append(pos_masks[i].copy()) + label_lists_split[gpu_id].append(copy.deepcopy(label_lists[i])) + # repeat or delete + for i in range(ngpu): + vp_len = len(pos_lists_split[i]) + if vp_len <= tcl_bs: + for j in range(0, tcl_bs - vp_len): + pos_list = pos_lists_split[i][j].copy() + pos_lists_split[i].append(pos_list) + pos_mask = pos_masks_split[i][j].copy() + pos_masks_split[i].append(pos_mask) + label_list = copy.deepcopy(label_lists_split[i][j]) + label_lists_split[i].append(label_list) + else: + for j in range(0, vp_len - tcl_bs): + c_len = len(pos_lists_split[i]) + pop_id = np.random.permutation(c_len)[0] + pos_lists_split[i].pop(pop_id) + pos_masks_split[i].pop(pop_id) + label_lists_split[i].pop(pop_id) + # merge + for i in range(ngpu): + pos_lists_.extend(pos_lists_split[i]) + pos_masks_.extend(pos_masks_split[i]) + label_lists_.extend(label_lists_split[i]) + return pos_lists_, pos_masks_, label_lists_ + + +def pre_process(label_list, pos_list, pos_mask, max_text_length, max_text_nums, + pad_num, tcl_bs): + label_list = label_list.numpy() + batch, _, _, _ = label_list.shape + pos_list = pos_list.numpy() + pos_mask = pos_mask.numpy() + pos_list_t = [] + pos_mask_t = [] + label_list_t = [] + for i in range(batch): + for j in range(max_text_nums): + if pos_mask[i, j].any(): + pos_list_t.append(pos_list[i][j]) + pos_mask_t.append(pos_mask[i][j]) + label_list_t.append(label_list[i][j]) + pos_list, pos_mask, label_list = org_tcl_rois(batch, pos_list_t, pos_mask_t, + label_list_t, tcl_bs) + label = [] + tt = [l.tolist() for l in label_list] + for i in range(tcl_bs): + k = 0 + for j in range(max_text_length): + if tt[i][j][0] != pad_num: + k += 1 + else: + break + label.append(k) + label = paddle.to_tensor(label) + label = paddle.cast(label, dtype='int64') + pos_list = paddle.to_tensor(pos_list) + pos_mask = paddle.to_tensor(pos_mask) + label_list = paddle.squeeze(paddle.to_tensor(label_list), axis=2) + label_list = paddle.cast(label_list, dtype='int32') + return pos_list, pos_mask, label_list, label diff --git a/ppocr/utils/e2e_utils/extract_textpoint_fast.py b/ppocr/utils/e2e_utils/extract_textpoint_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..a85b8e78ead00e64630b57400b9e5141eb0181a8 --- /dev/null +++ b/ppocr/utils/e2e_utils/extract_textpoint_fast.py @@ -0,0 +1,492 @@ +# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. + The value of keep_blank should be [None, 95]. + """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. 
+ """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, + logits_map, + pts_num=4, + point_gather_mode=None): + _, _, C = logits_map.shape + if point_gather_mode == 'align': + insert_num = 0 + gather_info = np.array(gather_info) + length = len(gather_info) - 1 + for index in range(length): + stride_y = np.abs(gather_info[index + insert_num][0] - gather_info[ + index + 1 + insert_num][0]) + stride_x = np.abs(gather_info[index + insert_num][1] - gather_info[ + index + 1 + insert_num][1]) + max_points = int(max(stride_x, stride_y)) + stride = (gather_info[index + insert_num] - + gather_info[index + 1 + insert_num]) / (max_points) + insert_num_temp = max_points - 1 + + for i in range(int(insert_num_temp)): + insert_value = gather_info[index + insert_num] - (i + 1 + ) * stride + insert_index = index + i + 1 + insert_num + gather_info = np.insert( + gather_info, insert_index, insert_value, axis=0) + insert_num += insert_num_temp + gather_info = gather_info.tolist() + else: + pass + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] + probs_seq = logits_seq + labels = np.argmax(probs_seq, axis=1) + dst_str = [k for k, v_ in groupby(labels) if k != C - 1] + detal = len(gather_info) // (pts_num - 1) + keep_idx_list = [0] + [detal * (i + 1) for i in range(pts_num - 2)] + [-1] + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, + logits_map, + Lexicon_Table, + pts_num=6, + point_gather_mode=None): + """ + CTC decoder using multiple processes. + """ + decoder_str = [] + decoder_xys = [] + for gather_info in gather_info_list: + if len(gather_info) < pts_num: + continue + dst_str, xys_list = instance_ctc_greedy_decoder( + gather_info, + logits_map, + pts_num=pts_num, + point_gather_mode=point_gather_mode) + dst_str_readable = ''.join([Lexicon_Table[idx] for idx in dst_str]) + if len(dst_str_readable) < 2: + continue + decoder_str.append(dst_str_readable) + decoder_xys.append(xys_list) + return decoder_str, decoder_xys + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2) + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. 
+ """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def restore_poly(instance_yxs_list, seq_strs, p_border, ratio_w, ratio_h, src_w, + src_h, valid_set): + poly_list = [] + keep_str_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if len(keep_str) < 2: + print('--> too short, {}'.format(keep_str)) + continue + + offset_expand = 1.0 + if valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) * offset_expand + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + detected_poly = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip(detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip(detected_poly[:, 1], a_min=0, a_max=src_h) + + keep_str_list.append(keep_str) + if valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + return poly_list, keep_str_list + + +def generate_pivot_list_fast(p_score, + p_char_maps, + f_direction, + Lexicon_Table, + score_thresh=0.5, + point_gather_mode=None): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map.astype(np.uint8)) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + if len(pos_list) < 3: + continue + + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + all_pos_yxs.append(pos_list_sorted) + + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decoded_str, keep_yxs_list = ctc_decoder_for_image( + all_pos_yxs, + logits_map=p_char_maps, + Lexicon_Table=Lexicon_Table, + point_gather_mode=point_gather_mode) + return keep_yxs_list, decoded_str + + +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point diff --git a/ppocr/utils/e2e_utils/extract_textpoint_slow.py b/ppocr/utils/e2e_utils/extract_textpoint_slow.py new file mode 100644 index 0000000000000000000000000000000000000000..ace46fba372e0b6ee27959076e1e855e77c4305c --- /dev/null +++ b/ppocr/utils/e2e_utils/extract_textpoint_slow.py @@ -0,0 +1,592 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains various CTC decoders.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import cv2 +import math + +import numpy as np +from itertools import groupby +from skimage.morphology._skeletonize import thin + + +def get_dict(character_dict_path): + character_str = "" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + character_str += line + dict_character = list(character_str) + return dict_character + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. + """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def softmax(logits): + """ + logits: N x d + """ + max_value = np.max(logits, axis=1, keepdims=True) + exp = np.exp(logits - max_value) + exp_sum = np.sum(exp, axis=1, keepdims=True) + dist = exp / exp_sum + return dist + + +def get_keep_pos_idxs(labels, remove_blank=None): + """ + Remove duplicate and get pos idxs of keep items. + The value of keep_blank should be [None, 95]. 
+ """ + duplicate_len_list = [] + keep_pos_idx_list = [] + keep_char_idx_list = [] + for k, v_ in groupby(labels): + current_len = len(list(v_)) + if k != remove_blank: + current_idx = int(sum(duplicate_len_list) + current_len // 2) + keep_pos_idx_list.append(current_idx) + keep_char_idx_list.append(k) + duplicate_len_list.append(current_len) + return keep_char_idx_list, keep_pos_idx_list + + +def remove_blank(labels, blank=0): + new_labels = [x for x in labels if x != blank] + return new_labels + + +def insert_blank(labels, blank=0): + new_labels = [blank] + for l in labels: + new_labels += [l, blank] + return new_labels + + +def ctc_greedy_decoder(probs_seq, blank=95, keep_blank_in_idxs=True): + """ + CTC greedy (best path) decoder. + """ + raw_str = np.argmax(np.array(probs_seq), axis=1) + remove_blank_in_pos = None if keep_blank_in_idxs else blank + dedup_str, keep_idx_list = get_keep_pos_idxs( + raw_str, remove_blank=remove_blank_in_pos) + dst_str = remove_blank(dedup_str, blank=blank) + return dst_str, keep_idx_list + + +def instance_ctc_greedy_decoder(gather_info, + logits_map, + keep_blank_in_idxs=True): + """ + gather_info: [[x, y], [x, y] ...] + logits_map: H x W X (n_chars + 1) + """ + _, _, C = logits_map.shape + ys, xs = zip(*gather_info) + logits_seq = logits_map[list(ys), list(xs)] # n x 96 + probs_seq = softmax(logits_seq) + dst_str, keep_idx_list = ctc_greedy_decoder( + probs_seq, blank=C - 1, keep_blank_in_idxs=keep_blank_in_idxs) + keep_gather_list = [gather_info[idx] for idx in keep_idx_list] + return dst_str, keep_gather_list + + +def ctc_decoder_for_image(gather_info_list, logits_map, + keep_blank_in_idxs=True): + """ + CTC decoder using multiple processes. + """ + decoder_results = [] + for gather_info in gather_info_list: + res = instance_ctc_greedy_decoder( + gather_info, logits_map, keep_blank_in_idxs=keep_blank_in_idxs) + decoder_results.append(res) + return decoder_results + + +def sort_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ """ + + def sort_part_with_direction(pos_list, point_direction): + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 2) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point, np.array(sorted_direction) + + +def add_id(pos_list, image_id=0): + """ + Add id for gather feature, for inference. + """ + new_list = [] + for item in pos_list: + new_list.append((image_id, item[0], item[1])) + return new_list + + +def sort_and_expand_with_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + left_list = [] + right_list = [] + for i in range(append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + left_list.append((ly, lx)) + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + right_list.append((ry, rx)) + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def sort_and_expand_with_direction_v2(pos_list, f_direction, binary_tcl_map): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] 
+ binary_tcl_map: h x w + """ + h, w, _ = f_direction.shape + sorted_list, point_direction = sort_with_direction(pos_list, f_direction) + + # expand along + point_num = len(sorted_list) + sub_direction_len = max(point_num // 3, 2) + left_direction = point_direction[:sub_direction_len, :] + right_dirction = point_direction[point_num - sub_direction_len:, :] + + left_average_direction = -np.mean(left_direction, axis=0, keepdims=True) + left_average_len = np.linalg.norm(left_average_direction) + left_start = np.array(sorted_list[0]) + left_step = left_average_direction / (left_average_len + 1e-6) + + right_average_direction = np.mean(right_dirction, axis=0, keepdims=True) + right_average_len = np.linalg.norm(right_average_direction) + right_step = right_average_direction / (right_average_len + 1e-6) + right_start = np.array(sorted_list[-1]) + + append_num = max( + int((left_average_len + right_average_len) / 2.0 * 0.15), 1) + max_append_num = 2 * append_num + + left_list = [] + right_list = [] + for i in range(max_append_num): + ly, lx = np.round(left_start + left_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ly < h and lx < w and (ly, lx) not in left_list: + if binary_tcl_map[ly, lx] > 0.5: + left_list.append((ly, lx)) + else: + break + + for i in range(max_append_num): + ry, rx = np.round(right_start + right_step * (i + 1)).flatten().astype( + 'int32').tolist() + if ry < h and rx < w and (ry, rx) not in right_list: + if binary_tcl_map[ry, rx] > 0.5: + right_list.append((ry, rx)) + else: + break + + all_list = left_list[::-1] + sorted_list + right_list + return all_list + + +def generate_pivot_list_curved(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_expand=True, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + pred_strs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + + if is_expand: + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + else: + pos_list_sorted, _ = sort_with_direction(pos_list, f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. 
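+    # p_char_maps arrives as (C, H, W); transpose it to (H, W, C) so the character
+    # logits under every sorted center-line point can be gathered and greedily
+    # CTC-decoded (the last channel is the blank). Each instance then yields a
+    # decoded string plus the kept positions, returned either per instance
+    # (is_backbone=True) or flattened into center-point / end-point lists.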
+ p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + pred_strs.append(decoded_str) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return pred_strs, instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_horizontal(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map_bi = (p_score > score_thresh) * 1.0 + instance_count, instance_label_map = cv2.connectedComponents( + p_tcl_map_bi.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + center_pos_yxs = [] + end_points_yxs = [] + instance_center_pos_yxs = [] + + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + + ### FIX-ME, eliminate outlier + if len(pos_list) < 5: + continue + + # add rule here + main_direction = extract_main_direction(pos_list, + f_direction) # y x + reference_directin = np.array([0, 1]).reshape([-1, 2]) # y x + is_h_angle = abs(np.sum( + main_direction * reference_directin)) < math.cos(math.pi / 180 * + 70) + + point_yxs = np.array(pos_list) + max_y, max_x = np.max(point_yxs, axis=0) + min_y, min_x = np.min(point_yxs, axis=0) + is_h_len = (max_y - min_y) < 1.5 * (max_x - min_x) + + pos_list_final = [] + if is_h_len: + xs = np.unique(xs) + for x in xs: + ys = instance_label_map[:, x].copy().reshape((-1, )) + y = int(np.where(ys == instance_id)[0].mean()) + pos_list_final.append((y, x)) + else: + ys = np.unique(ys) + for y in ys: + xs = instance_label_map[y, :].copy().reshape((-1, )) + x = int(np.where(xs == instance_id)[0].mean()) + pos_list_final.append((y, x)) + + pos_list_sorted, _ = sort_with_direction(pos_list_final, + f_direction) + all_pos_yxs.append(pos_list_sorted) + + # use decoder to filter backgroud points. + p_char_maps = p_char_maps.transpose([1, 2, 0]) + decode_res = ctc_decoder_for_image( + all_pos_yxs, logits_map=p_char_maps, keep_blank_in_idxs=True) + for decoded_str, keep_yxs_list in decode_res: + if is_backbone: + keep_yxs_list_with_id = add_id(keep_yxs_list, image_id=image_id) + instance_center_pos_yxs.append(keep_yxs_list_with_id) + else: + end_points_yxs.extend((keep_yxs_list[0], keep_yxs_list[-1])) + center_pos_yxs.extend(keep_yxs_list) + + if is_backbone: + return instance_center_pos_yxs + else: + return center_pos_yxs, end_points_yxs + + +def generate_pivot_list_slow(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + Warp all the function together. 
+ """ + if is_curved: + return generate_pivot_list_curved( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_expand=True, + is_backbone=is_backbone, + image_id=image_id) + else: + return generate_pivot_list_horizontal( + p_score, + p_char_maps, + f_direction, + score_thresh=score_thresh, + is_backbone=is_backbone, + image_id=image_id) + + +# for refine module +def extract_main_direction(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + pos_list = np.array(pos_list) + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + average_direction = average_direction / ( + np.linalg.norm(average_direction) + 1e-6) + return average_direction + + +def sort_by_direction_with_image_id_deprecated(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[id, y, x], [id, y, x], [id, y, x] ...] + """ + pos_list_full = np.array(pos_list).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = f_direction[pos_list[:, 0], pos_list[:, 1]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + return sorted_list + + +def sort_by_direction_with_image_id(pos_list, f_direction): + """ + f_direction: h x w x 2 + pos_list: [[y, x], [y, x], [y, x] ...] + """ + + def sort_part_with_direction(pos_list_full, point_direction): + pos_list_full = np.array(pos_list_full).reshape(-1, 3) + pos_list = pos_list_full[:, 1:] + point_direction = np.array(point_direction).reshape(-1, 2) + average_direction = np.mean(point_direction, axis=0, keepdims=True) + pos_proj_leng = np.sum(pos_list * average_direction, axis=1) + sorted_list = pos_list_full[np.argsort(pos_proj_leng)].tolist() + sorted_direction = point_direction[np.argsort(pos_proj_leng)].tolist() + return sorted_list, sorted_direction + + pos_list = np.array(pos_list).reshape(-1, 3) + point_direction = f_direction[pos_list[:, 1], pos_list[:, 2]] # x, y + point_direction = point_direction[:, ::-1] # x, y -> y, x + sorted_point, sorted_direction = sort_part_with_direction(pos_list, + point_direction) + + point_num = len(sorted_point) + if point_num >= 16: + middle_num = point_num // 2 + first_part_point = sorted_point[:middle_num] + first_point_direction = sorted_direction[:middle_num] + sorted_fist_part_point, sorted_fist_part_direction = sort_part_with_direction( + first_part_point, first_point_direction) + + last_part_point = sorted_point[middle_num:] + last_point_direction = sorted_direction[middle_num:] + sorted_last_part_point, sorted_last_part_direction = sort_part_with_direction( + last_part_point, last_point_direction) + sorted_point = sorted_fist_part_point + sorted_last_part_point + sorted_direction = sorted_fist_part_direction + sorted_last_part_direction + + return sorted_point + + +def generate_pivot_list_tt_inference(p_score, + p_char_maps, + f_direction, + score_thresh=0.5, + is_backbone=False, + is_curved=True, + image_id=0): + """ + return center point and end point of TCL instance; filter with the char maps; + """ + p_score = p_score[0] + f_direction = f_direction.transpose(1, 2, 0) + p_tcl_map = (p_score > score_thresh) * 1.0 + skeleton_map = thin(p_tcl_map) + instance_count, instance_label_map = 
cv2.connectedComponents( + skeleton_map.astype(np.uint8), connectivity=8) + + # get TCL Instance + all_pos_yxs = [] + if instance_count > 0: + for instance_id in range(1, instance_count): + pos_list = [] + ys, xs = np.where(instance_label_map == instance_id) + pos_list = list(zip(ys, xs)) + ### FIX-ME, eliminate outlier + if len(pos_list) < 3: + continue + pos_list_sorted = sort_and_expand_with_direction_v2( + pos_list, f_direction, p_tcl_map) + pos_list_sorted_with_id = add_id(pos_list_sorted, image_id=image_id) + all_pos_yxs.append(pos_list_sorted_with_id) + return all_pos_yxs diff --git a/ppocr/utils/e2e_utils/pgnet_pp_utils.py b/ppocr/utils/e2e_utils/pgnet_pp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..06a766b0e714e2792c0b0d3069963de998eb9eb7 --- /dev/null +++ b/ppocr/utils/e2e_utils/pgnet_pp_utils.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +import os +import sys + +__dir__ = os.path.dirname(__file__) +sys.path.append(__dir__) +sys.path.append(os.path.join(__dir__, '..')) +from extract_textpoint_slow import * +from extract_textpoint_fast import generate_pivot_list_fast, restore_poly + + +class PGNet_PostProcess(object): + # two different post-process + def __init__(self, + character_dict_path, + valid_set, + score_thresh, + outs_dict, + shape_list, + point_gather_mode=None): + self.Lexicon_Table = get_dict(character_dict_path) + self.valid_set = valid_set + self.score_thresh = score_thresh + self.outs_dict = outs_dict + self.shape_list = shape_list + self.point_gather_mode = point_gather_mode + + def pg_postprocess_fast(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, paddle.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + + src_h, src_w, ratio_h, ratio_w = self.shape_list[0] + instance_yxs_list, seq_strs = generate_pivot_list_fast( + p_score, + p_char, + p_direction, + self.Lexicon_Table, + score_thresh=self.score_thresh, + point_gather_mode=self.point_gather_mode) + poly_list, keep_str_list = restore_poly(instance_yxs_list, seq_strs, + p_border, ratio_w, ratio_h, + src_w, src_h, self.valid_set) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data + + def pg_postprocess_slow(self): + p_score = self.outs_dict['f_score'] + p_border = self.outs_dict['f_border'] + p_char = self.outs_dict['f_char'] + p_direction = self.outs_dict['f_direction'] + if isinstance(p_score, paddle.Tensor): + p_score = p_score[0].numpy() + p_border = p_border[0].numpy() + p_direction = 
p_direction[0].numpy() + p_char = p_char[0].numpy() + else: + p_score = p_score[0] + p_border = p_border[0] + p_direction = p_direction[0] + p_char = p_char[0] + src_h, src_w, ratio_h, ratio_w = self.shape_list[0] + is_curved = self.valid_set == "totaltext" + char_seq_idx_set, instance_yxs_list = generate_pivot_list_slow( + p_score, + p_char, + p_direction, + score_thresh=self.score_thresh, + is_backbone=True, + is_curved=is_curved) + seq_strs = [] + for char_idx_set in char_seq_idx_set: + pr_str = ''.join([self.Lexicon_Table[pos] for pos in char_idx_set]) + seq_strs.append(pr_str) + poly_list = [] + keep_str_list = [] + all_point_list = [] + all_point_pair_list = [] + for yx_center_line, keep_str in zip(instance_yxs_list, seq_strs): + if len(yx_center_line) == 1: + yx_center_line.append(yx_center_line[-1]) + + offset_expand = 1.0 + if self.valid_set == 'totaltext': + offset_expand = 1.2 + + point_pair_list = [] + for batch_id, y, x in yx_center_line: + offset = p_border[:, y, x].reshape(2, 2) + if offset_expand != 1.0: + offset_length = np.linalg.norm( + offset, axis=1, keepdims=True) + expand_length = np.clip( + offset_length * (offset_expand - 1), + a_min=0.5, + a_max=3.0) + offset_detal = offset / offset_length * expand_length + offset = offset + offset_detal + ori_yx = np.array([y, x], dtype=np.float32) + point_pair = (ori_yx + offset)[:, ::-1] * 4.0 / np.array( + [ratio_w, ratio_h]).reshape(-1, 2) + point_pair_list.append(point_pair) + + all_point_list.append([ + int(round(x * 4.0 / ratio_w)), + int(round(y * 4.0 / ratio_h)) + ]) + all_point_pair_list.append(point_pair.round().astype(np.int32) + .tolist()) + + detected_poly, pair_length_info = point_pair2poly(point_pair_list) + detected_poly = expand_poly_along_width( + detected_poly, shrink_ratio_of_width=0.2) + detected_poly[:, 0] = np.clip( + detected_poly[:, 0], a_min=0, a_max=src_w) + detected_poly[:, 1] = np.clip( + detected_poly[:, 1], a_min=0, a_max=src_h) + + if len(keep_str) < 2: + continue + + keep_str_list.append(keep_str) + detected_poly = np.round(detected_poly).astype('int32') + if self.valid_set == 'partvgg': + middle_point = len(detected_poly) // 2 + detected_poly = detected_poly[ + [0, middle_point - 1, middle_point, -1], :] + poly_list.append(detected_poly) + elif self.valid_set == 'totaltext': + poly_list.append(detected_poly) + else: + print('--> Not supported format.') + exit(-1) + data = { + 'points': poly_list, + 'texts': keep_str_list, + } + return data diff --git a/ppocr/utils/e2e_utils/visual.py b/ppocr/utils/e2e_utils/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e4fd0667dbf4a42dbc0fd9bf26e6fd91be0d82 --- /dev/null +++ b/ppocr/utils/e2e_utils/visual.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
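+# Geometry helpers used by the PGNet post-process (e.g. pgnet_pp_utils.py) live
+# in this module. Illustrative usage, not part of the original file:
+# point_pair2poly() joins N top/bottom point pairs into one clockwise polygon
+# of 2*N points, e.g.
+#   poly, pair_info = point_pair2poly(
+#       [np.array([[0, 0], [0, 10]]), np.array([[5, 0], [5, 10]])])
+#   # poly -> [[0, 0], [5, 0], [5, 10], [0, 10]], pair_info -> (10.0, 10.0, 10.0)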
+import numpy as np +import cv2 +import time + + +def resize_image(im, max_side_len=512): + """ + resize image to a size multiple of max_stride which is required by the network + :param im: the resized image + :param max_side_len: limit of max image size to avoid out of memory in gpu + :return: the resized image and the resize ratio + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h > resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return im, (ratio_h, ratio_w) + + +def resize_image_min(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + + if resize_h < resize_w: + ratio = float(max_side_len) / resize_h + else: + ratio = float(max_side_len) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def resize_image_for_totaltext(im, max_side_len=512): + """ + """ + h, w, _ = im.shape + + resize_w = w + resize_h = h + ratio = 1.25 + if h * ratio > max_side_len: + ratio = float(max_side_len) / resize_h + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + im = cv2.resize(im, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + return im, (ratio_h, ratio_w) + + +def point_pair2poly(point_pair_list): + """ + Transfer vertical point_pairs into poly point in clockwise. + """ + pair_length_list = [] + for point_pair in point_pair_list: + pair_length = np.linalg.norm(point_pair[0] - point_pair[1]) + pair_length_list.append(pair_length) + pair_length_list = np.array(pair_length_list) + pair_info = (pair_length_list.max(), pair_length_list.min(), + pair_length_list.mean()) + + point_num = len(point_pair_list) * 2 + point_list = [0] * point_num + for idx, point_pair in enumerate(point_pair_list): + point_list[idx] = point_pair[0] + point_list[point_num - 1 - idx] = point_pair[1] + return np.array(point_list).reshape(-1, 2), pair_info + + +def shrink_quad_along_width(quad, begin_width_ratio=0., end_width_ratio=1.): + """ + Generate shrink_quad_along_width. + """ + ratio_pair = np.array( + [[begin_width_ratio], [end_width_ratio]], dtype=np.float32) + p0_1 = quad[0] + (quad[1] - quad[0]) * ratio_pair + p3_2 = quad[3] + (quad[2] - quad[3]) * ratio_pair + return np.array([p0_1[0], p0_1[1], p3_2[1], p3_2[0]]) + + +def expand_poly_along_width(poly, shrink_ratio_of_width=0.3): + """ + expand poly along width. 
+ """ + point_num = poly.shape[0] + left_quad = np.array( + [poly[0], poly[1], poly[-2], poly[-1]], dtype=np.float32) + left_ratio = -shrink_ratio_of_width * np.linalg.norm(left_quad[0] - left_quad[3]) / \ + (np.linalg.norm(left_quad[0] - left_quad[1]) + 1e-6) + left_quad_expand = shrink_quad_along_width(left_quad, left_ratio, 1.0) + right_quad = np.array( + [ + poly[point_num // 2 - 2], poly[point_num // 2 - 1], + poly[point_num // 2], poly[point_num // 2 + 1] + ], + dtype=np.float32) + right_ratio = 1.0 + \ + shrink_ratio_of_width * np.linalg.norm(right_quad[0] - right_quad[3]) / \ + (np.linalg.norm(right_quad[0] - right_quad[1]) + 1e-6) + right_quad_expand = shrink_quad_along_width(right_quad, 0.0, right_ratio) + poly[0] = left_quad_expand[0] + poly[-1] = left_quad_expand[-1] + poly[point_num // 2 - 1] = right_quad_expand[1] + poly[point_num // 2] = right_quad_expand[2] + return poly + + +def norm2(x, axis=None): + if axis: + return np.sqrt(np.sum(x**2, axis=axis)) + return np.sqrt(np.sum(x**2)) + + +def cos(p1, p2): + return (p1 * p2).sum() / (norm2(p1) * norm2(p2)) diff --git a/ppocr/utils/en_dict.txt b/ppocr/utils/en_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..7677d31b9d3f08eef2823c2cf051beeab1f0470b --- /dev/null +++ b/ppocr/utils/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/ppocr/utils/gen_label.py b/ppocr/utils/gen_label.py new file mode 100644 index 0000000000000000000000000000000000000000..fb78bd38bcfc1a59cac48a28bbb655ecb83bcb3f --- /dev/null +++ b/ppocr/utils/gen_label.py @@ -0,0 +1,81 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import argparse +import json + + +def gen_rec_label(input_path, out_label): + with open(out_label, 'w') as out_file: + with open(input_path, 'r') as f: + for line in f.readlines(): + tmp = line.strip('\n').replace(" ", "").split(',') + img_path, label = tmp[0], tmp[1] + label = label.replace("\"", "") + out_file.write(img_path + '\t' + label + '\n') + + +def gen_det_label(root_path, input_dir, out_label): + with open(out_label, 'w') as out_file: + for label_file in os.listdir(input_dir): + img_path = root_path + label_file[3:-4] + ".jpg" + label = [] + with open( + os.path.join(input_dir, label_file), 'r', + encoding='utf-8-sig') as f: + for line in f.readlines(): + tmp = line.strip("\n\r").replace("\xef\xbb\xbf", + "").split(',') + points = tmp[:8] + s = [] + for i in range(0, len(points), 2): + b = points[i:i + 2] + b = [int(t) for t in b] + s.append(b) + result = {"transcription": tmp[8], "points": s} + label.append(result) + + out_file.write(img_path + '\t' + json.dumps( + label, ensure_ascii=False) + '\n') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--mode', + type=str, + default="rec", + help='Generate rec_label or det_label, can be set rec or det') + parser.add_argument( + '--root_path', + type=str, + default=".", + help='The root directory of images.Only takes effect when mode=det ') + parser.add_argument( + '--input_path', + type=str, + default=".", + help='Input_label or input path to be converted') + parser.add_argument( + '--output_label', + type=str, + default="out_label.txt", + help='Output file name') + + args = parser.parse_args() + if args.mode == "rec": + print("Generate rec label") + gen_rec_label(args.input_path, args.output_label) + elif args.mode == "det": + gen_det_label(args.root_path, args.input_path, args.output_label) diff --git a/ppocr/utils/ic15_dict.txt b/ppocr/utils/ic15_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..474060366f8a2a00c108d5c743821c0a61867cd5 --- /dev/null +++ b/ppocr/utils/ic15_dict.txt @@ -0,0 +1,36 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z \ No newline at end of file diff --git a/ppocr/utils/iou.py b/ppocr/utils/iou.py new file mode 100644 index 0000000000000000000000000000000000000000..35459f5f053cde0a74f76c5652bfb723a48ca890 --- /dev/null +++ b/ppocr/utils/iou.py @@ -0,0 +1,54 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
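+# Usage sketch (illustrative, tensor names are placeholders): given predicted
+# and ground-truth maps of shape [N, H, W] plus a training mask, the
+# batch-mean IoU is obtained with
+#   miou = iou(pred_text, gt_text, training_mask, n_class=2, reduce=True)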
+""" +This code is refer from: +https://github.com/whai362/PSENet/blob/python3/models/loss/iou.py +""" + +import paddle + +EPS = 1e-6 + + +def iou_single(a, b, mask, n_class): + valid = mask == 1 + a = a.masked_select(valid) + b = b.masked_select(valid) + miou = [] + for i in range(n_class): + if a.shape == [0] and a.shape == b.shape: + inter = paddle.to_tensor(0.0) + union = paddle.to_tensor(0.0) + else: + inter = ((a == i).logical_and(b == i)).astype('float32') + union = ((a == i).logical_or(b == i)).astype('float32') + miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS)) + miou = sum(miou) / len(miou) + return miou + + +def iou(a, b, mask, n_class=2, reduce=True): + batch_size = a.shape[0] + + a = a.reshape([batch_size, -1]) + b = b.reshape([batch_size, -1]) + mask = mask.reshape([batch_size, -1]) + + iou = paddle.zeros((batch_size, ), dtype='float32') + for i in range(batch_size): + iou[i] = iou_single(a[i], b[i], mask[i], n_class) + + if reduce: + iou = paddle.mean(iou) + return iou diff --git a/ppocr/utils/loggers/__init__.py b/ppocr/utils/loggers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b1e92f734e84b7e0278f8e7940ef3baf137c159e --- /dev/null +++ b/ppocr/utils/loggers/__init__.py @@ -0,0 +1,3 @@ +from .vdl_logger import VDLLogger +from .wandb_logger import WandbLogger +from .loggers import Loggers diff --git a/ppocr/utils/loggers/base_logger.py b/ppocr/utils/loggers/base_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..3a7fc3593ba8e69fdd5bed386c7ae4ff0d459988 --- /dev/null +++ b/ppocr/utils/loggers/base_logger.py @@ -0,0 +1,15 @@ +import os +from abc import ABC, abstractmethod + +class BaseLogger(ABC): + def __init__(self, save_dir): + self.save_dir = save_dir + os.makedirs(self.save_dir, exist_ok=True) + + @abstractmethod + def log_metrics(self, metrics, prefix=None): + pass + + @abstractmethod + def close(self): + pass \ No newline at end of file diff --git a/ppocr/utils/loggers/loggers.py b/ppocr/utils/loggers/loggers.py new file mode 100644 index 0000000000000000000000000000000000000000..260146620811c8e72da66e9f2c7bbcbaef90b90d --- /dev/null +++ b/ppocr/utils/loggers/loggers.py @@ -0,0 +1,18 @@ +from .wandb_logger import WandbLogger + +class Loggers(object): + def __init__(self, loggers): + super().__init__() + self.loggers = loggers + + def log_metrics(self, metrics, prefix=None, step=None): + for logger in self.loggers: + logger.log_metrics(metrics, prefix=prefix, step=step) + + def log_model(self, is_best, prefix, metadata=None): + for logger in self.loggers: + logger.log_model(is_best=is_best, prefix=prefix, metadata=metadata) + + def close(self): + for logger in self.loggers: + logger.close() \ No newline at end of file diff --git a/ppocr/utils/loggers/vdl_logger.py b/ppocr/utils/loggers/vdl_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..c345f93235b239873f0ddcd49c8b1b8966877a03 --- /dev/null +++ b/ppocr/utils/loggers/vdl_logger.py @@ -0,0 +1,21 @@ +from .base_logger import BaseLogger +from visualdl import LogWriter + +class VDLLogger(BaseLogger): + def __init__(self, save_dir): + super().__init__(save_dir) + self.vdl_writer = LogWriter(logdir=save_dir) + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix + "/" + k: v for k, v in metrics.items()} + + for k, v in updated_metrics.items(): + self.vdl_writer.add_scalar(k, v, step) + + def log_model(self, is_best, prefix, metadata=None): + pass + + def 
close(self): + self.vdl_writer.close() \ No newline at end of file diff --git a/ppocr/utils/loggers/wandb_logger.py b/ppocr/utils/loggers/wandb_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b9c6711696569e825638e0a27394071020b29cb5 --- /dev/null +++ b/ppocr/utils/loggers/wandb_logger.py @@ -0,0 +1,78 @@ +import os +from .base_logger import BaseLogger + +class WandbLogger(BaseLogger): + def __init__(self, + project=None, + name=None, + id=None, + entity=None, + save_dir=None, + config=None, + **kwargs): + try: + import wandb + self.wandb = wandb + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install wandb using `pip install wandb`" + ) + + self.project = project + self.name = name + self.id = id + self.save_dir = save_dir + self.config = config + self.kwargs = kwargs + self.entity = entity + self._run = None + self._wandb_init = dict( + project=self.project, + name=self.name, + id=self.id, + entity=self.entity, + dir=self.save_dir, + resume="allow" + ) + self._wandb_init.update(**kwargs) + + _ = self.run + + if self.config: + self.run.config.update(self.config) + + @property + def run(self): + if self._run is None: + if self.wandb.run is not None: + logger.info( + "There is a wandb run already in progress " + "and newly created instances of `WandbLogger` will reuse" + " this run. If this is not desired, call `wandb.finish()`" + "before instantiating `WandbLogger`." + ) + self._run = self.wandb.run + else: + self._run = self.wandb.init(**self._wandb_init) + return self._run + + def log_metrics(self, metrics, prefix=None, step=None): + if not prefix: + prefix = "" + updated_metrics = {prefix.lower() + "/" + k: v for k, v in metrics.items()} + + self.run.log(updated_metrics, step=step) + + def log_model(self, is_best, prefix, metadata=None): + model_path = os.path.join(self.save_dir, prefix + '.pdparams') + artifact = self.wandb.Artifact('model-{}'.format(self.run.id), type='model', metadata=metadata) + artifact.add_file(model_path, name="model_ckpt.pdparams") + + aliases = [prefix] + if is_best: + aliases.append("best") + + self.run.log_artifact(artifact, aliases=aliases) + + def close(self): + self.run.finish() \ No newline at end of file diff --git a/ppocr/utils/logging.py b/ppocr/utils/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..1eac8f351a4d30915d6f4ca863267cb73b9b1f19 --- /dev/null +++ b/ppocr/utils/logging.py @@ -0,0 +1,71 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/WenmuZhou/PytorchOCR/blob/master/torchocr/utils/logging.py +""" + +import os +import sys +import logging +import functools +import paddle.distributed as dist + +logger_initialized = {} + + +@functools.lru_cache() +def get_logger(name='ppocr', log_file=None, log_level=logging.DEBUG): + """Initialize and get a logger by name. 
+ If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + Returns: + logging.Logger: The expected logger. + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + if dist.get_rank() == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + logger_initialized[name] = True + logger.propagate = False + return logger diff --git a/ppocr/utils/network.py b/ppocr/utils/network.py new file mode 100644 index 0000000000000000000000000000000000000000..080a5d160116cfdd3b255a883525281d97ee9cc9 --- /dev/null +++ b/ppocr/utils/network.py @@ -0,0 +1,82 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
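+# Usage sketch (illustrative; the directory and URL below are placeholders):
+# resolve the target directory for a pretrained inference model and fetch the
+# tar package only when the extracted inference files are missing, e.g.
+#   model_dir, url = confirm_model_dir_url(None, '~/.paddleocr/det', SOME_DEFAULT_TAR_URL)
+#   maybe_download(model_dir, url)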
+ +import os +import sys +import tarfile +import requests +from tqdm import tqdm + +from ppocr.utils.logging import get_logger + + +def download_with_progressbar(url, save_path): + logger = get_logger() + response = requests.get(url, stream=True) + if response.status_code == 200: + total_size_in_bytes = int(response.headers.get('content-length', 1)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm( + total=total_size_in_bytes, unit='iB', unit_scale=True) + with open(save_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + else: + logger.error("Something went wrong while downloading models") + sys.exit(0) + + +def maybe_download(model_storage_directory, url): + # using custom model + tar_file_name_list = ['.pdiparams', '.pdiparams.info', '.pdmodel'] + if not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdiparams') + ) or not os.path.exists( + os.path.join(model_storage_directory, 'inference.pdmodel')): + assert url.endswith('.tar'), 'Only supports tar compressed package' + tmp_path = os.path.join(model_storage_directory, url.split('/')[-1]) + print('download {} to {}'.format(url, tmp_path)) + os.makedirs(model_storage_directory, exist_ok=True) + download_with_progressbar(url, tmp_path) + with tarfile.open(tmp_path, 'r') as tarObj: + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if member.name.endswith(tar_file_name): + filename = 'inference' + tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open( + os.path.join(model_storage_directory, filename), + 'wb') as f: + f.write(file.read()) + os.remove(tmp_path) + + +def is_link(s): + return s is not None and s.startswith('http') + + +def confirm_model_dir_url(model_dir, default_model_dir, default_url): + url = default_url + if model_dir is None or is_link(model_dir): + if is_link(model_dir): + url = model_dir + file_name = url.split('/')[-1][:-4] + model_dir = default_model_dir + model_dir = os.path.join(model_dir, file_name) + return model_dir, url diff --git a/ppocr/utils/poly_nms.py b/ppocr/utils/poly_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..9dcb3d2c2f7be2022529d5e54de357182f207cf5 --- /dev/null +++ b/ppocr/utils/poly_nms.py @@ -0,0 +1,146 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from shapely.geometry import Polygon + + +def points2polygon(points): + """Convert k points to 1 polygon. + + Args: + points (ndarray or list): A ndarray or a list of shape (2k) + that indicates k points. + + Returns: + polygon (Polygon): A polygon object. 
+ """ + if isinstance(points, list): + points = np.array(points) + + assert isinstance(points, np.ndarray) + assert (points.size % 2 == 0) and (points.size >= 8) + + point_mat = points.reshape([-1, 2]) + return Polygon(point_mat) + + +def poly_intersection(poly_det, poly_gt, buffer=0.0001): + """Calculate the intersection area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + intersection_area (float): The intersection area between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + if buffer == 0: + poly_inter = poly_det & poly_gt + else: + poly_inter = poly_det.buffer(buffer) & poly_gt.buffer(buffer) + return poly_inter.area, poly_inter + + +def poly_union(poly_det, poly_gt): + """Calculate the union area between two polygon. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + union_area (float): The union area between two polygons. + """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + + area_det = poly_det.area + area_gt = poly_gt.area + area_inters, _ = poly_intersection(poly_det, poly_gt) + return area_det + area_gt - area_inters + + +def valid_boundary(x, with_score=True): + num = len(x) + if num < 8: + return False + if num % 2 == 0 and (not with_score): + return True + if num % 2 == 1 and with_score: + return True + + return False + + +def boundary_iou(src, target): + """Calculate the IOU between two boundaries. + + Args: + src (list): Source boundary. + target (list): Target boundary. + + Returns: + iou (float): The iou between two boundaries. + """ + assert valid_boundary(src, False) + assert valid_boundary(target, False) + src_poly = points2polygon(src) + target_poly = points2polygon(target) + + return poly_iou(src_poly, target_poly) + + +def poly_iou(poly_det, poly_gt): + """Calculate the IOU between two polygons. + + Args: + poly_det (Polygon): A polygon predicted by detector. + poly_gt (Polygon): A gt polygon. + + Returns: + iou (float): The IOU between two polygons. 
+ """ + assert isinstance(poly_det, Polygon) + assert isinstance(poly_gt, Polygon) + area_inters, _ = poly_intersection(poly_det, poly_gt) + area_union = poly_union(poly_det, poly_gt) + if area_union == 0: + return 0.0 + return area_inters / area_union + + +def poly_nms(polygons, threshold): + assert isinstance(polygons, list) + + polygons = np.array(sorted(polygons, key=lambda x: x[-1])) + + keep_poly = [] + index = [i for i in range(polygons.shape[0])] + + while len(index) > 0: + keep_poly.append(polygons[index[-1]].tolist()) + A = polygons[index[-1]][:-1] + index = np.delete(index, -1) + iou_list = np.zeros((len(index), )) + for i in range(len(index)): + B = polygons[index[i]][:-1] + iou_list[i] = boundary_iou(A, B) + remove_index = np.where(iou_list > threshold) + index = np.delete(index, remove_index) + + return keep_poly diff --git a/ppocr/utils/ppocr_keys_v1.txt b/ppocr/utils/ppocr_keys_v1.txt new file mode 100644 index 0000000000000000000000000000000000000000..84b885d8352226e49b1d5d791b8f43a663e246aa --- /dev/null +++ b/ppocr/utils/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! +姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. 
+鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 
+邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 
+炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 
+胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 
+閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/ppocr/utils/profiler.py b/ppocr/utils/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..c4e28bc6bea9ca912a0786d879a48ec0349e7698 --- /dev/null +++ b/ppocr/utils/profiler.py @@ -0,0 +1,110 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. 
+ sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/ppocr/utils/save_load.py b/ppocr/utils/save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..aa65f290c0a5f4f13b3103fb4404815e2ae74a88 --- /dev/null +++ b/ppocr/utils/save_load.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
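+# Usage sketch (illustrative): at training start-up, load_model() restores
+# either Global.checkpoints (full resume: weights, optimizer state and the
+# best-metric dict) or Global.pretrained_model (weights only), e.g.
+#   best_model_dict = load_model(config, model, optimizer, model_type='det')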
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import pickle +import six + +import paddle + +from ppocr.utils.logging import get_logger + +__all__ = ['load_model'] + + +def _mkdir_if_not_exist(path, logger): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def load_model(config, model, optimizer=None, model_type='det'): + """ + load model from checkpoint or pretrained_model + """ + logger = get_logger() + global_config = config['Global'] + checkpoints = global_config.get('checkpoints') + pretrained_model = global_config.get('pretrained_model') + best_model_dict = {} + is_float16 = False + is_nlp_model = model_type == 'kie' and config["Architecture"][ + "algorithm"] not in ["SDMGR"] + + if is_nlp_model is True: + # NOTE: for kie model dsitillation, resume training is not supported now + if config["Architecture"]["algorithm"] in ["Distillation"]: + return best_model_dict + checkpoints = config['Architecture']['Backbone']['checkpoints'] + # load kie method metric + if checkpoints: + if os.path.exists(os.path.join(checkpoints, 'metric.states')): + with open(os.path.join(checkpoints, 'metric.states'), + 'rb') as f: + states_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + best_model_dict = states_dict.get('best_model_dict', {}) + if 'epoch' in states_dict: + best_model_dict['start_epoch'] = states_dict['epoch'] + 1 + logger.info("resume from {}".format(checkpoints)) + + if optimizer is not None: + if checkpoints[-1] in ['/', '\\']: + checkpoints = checkpoints[:-1] + if os.path.exists(checkpoints + '.pdopt'): + optim_dict = paddle.load(checkpoints + '.pdopt') + optimizer.set_state_dict(optim_dict) + else: + logger.warning( + "{}.pdopt is not exists, params of optimizer is not loaded". + format(checkpoints)) + + return best_model_dict + + if checkpoints: + if checkpoints.endswith('.pdparams'): + checkpoints = checkpoints.replace('.pdparams', '') + assert os.path.exists(checkpoints + ".pdparams"), \ + "The {}.pdparams does not exists!".format(checkpoints) + + # load params from trained model + params = paddle.load(checkpoints + '.pdparams') + state_dict = model.state_dict() + new_state_dict = {} + for key, value in state_dict.items(): + if key not in params: + logger.warning("{} not in loaded params {} !".format( + key, params.keys())) + continue + pre_value = params[key] + if pre_value.dtype == paddle.float16: + is_float16 = True + if pre_value.dtype != value.dtype: + pre_value = pre_value.astype(value.dtype) + if list(value.shape) == list(pre_value.shape): + new_state_dict[key] = pre_value + else: + logger.warning( + "The shape of model params {} {} not matched with loaded params shape {} !". + format(key, value.shape, pre_value.shape)) + model.set_state_dict(new_state_dict) + if is_float16: + logger.info( + "The parameter type is float16, which is converted to float32 when loading" + ) + if optimizer is not None: + if os.path.exists(checkpoints + '.pdopt'): + optim_dict = paddle.load(checkpoints + '.pdopt') + optimizer.set_state_dict(optim_dict) + else: + logger.warning( + "{}.pdopt is not exists, params of optimizer is not loaded". 
+ format(checkpoints)) + + if os.path.exists(checkpoints + '.states'): + with open(checkpoints + '.states', 'rb') as f: + states_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + best_model_dict = states_dict.get('best_model_dict', {}) + if 'epoch' in states_dict: + best_model_dict['start_epoch'] = states_dict['epoch'] + 1 + logger.info("resume from {}".format(checkpoints)) + elif pretrained_model: + is_float16 = load_pretrained_params(model, pretrained_model) + else: + logger.info('train from scratch') + best_model_dict['is_float16'] = is_float16 + return best_model_dict + + +def load_pretrained_params(model, path): + logger = get_logger() + if path.endswith('.pdparams'): + path = path.replace('.pdparams', '') + assert os.path.exists(path + ".pdparams"), \ + "The {}.pdparams does not exists!".format(path) + + params = paddle.load(path + '.pdparams') + + state_dict = model.state_dict() + + new_state_dict = {} + is_float16 = False + + for k1 in params.keys(): + + if k1 not in state_dict.keys(): + logger.warning("The pretrained params {} not in model".format(k1)) + else: + if params[k1].dtype == paddle.float16: + is_float16 = True + if params[k1].dtype != state_dict[k1].dtype: + params[k1] = params[k1].astype(state_dict[k1].dtype) + if list(state_dict[k1].shape) == list(params[k1].shape): + new_state_dict[k1] = params[k1] + else: + logger.warning( + "The shape of model params {} {} not matched with loaded params {} {} !". + format(k1, state_dict[k1].shape, k1, params[k1].shape)) + + model.set_state_dict(new_state_dict) + if is_float16: + logger.info( + "The parameter type is float16, which is converted to float32 when loading" + ) + logger.info("load pretrain successful from {}".format(path)) + return is_float16 + + +def save_model(model, + optimizer, + model_path, + logger, + config, + is_best=False, + prefix='ppocr', + **kwargs): + """ + save model to the target path + """ + _mkdir_if_not_exist(model_path, logger) + model_prefix = os.path.join(model_path, prefix) + paddle.save(optimizer.state_dict(), model_prefix + '.pdopt') + + is_nlp_model = config['Architecture']["model_type"] == 'kie' and config[ + "Architecture"]["algorithm"] not in ["SDMGR"] + if is_nlp_model is not True: + paddle.save(model.state_dict(), model_prefix + '.pdparams') + metric_prefix = model_prefix + else: # for kie system, we follow the save/load rules in NLP + if config['Global']['distributed']: + arch = model._layers + else: + arch = model + if config["Architecture"]["algorithm"] in ["Distillation"]: + arch = arch.Student + arch.backbone.model.save_pretrained(model_prefix) + metric_prefix = os.path.join(model_prefix, 'metric') + # save metric and config + with open(metric_prefix + '.states', 'wb') as f: + pickle.dump(kwargs, f, protocol=2) + if is_best: + logger.info('save best model is to {}'.format(model_prefix)) + else: + logger.info("save model in {}".format(model_prefix)) diff --git a/ppocr/utils/stats.py b/ppocr/utils/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..179b0082f16133ac202c6f0dc120ec07c10fd18e --- /dev/null +++ b/ppocr/utils/stats.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import numpy as np +import datetime + +__all__ = ['TrainingStats', 'Time'] + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size): + self.deque = collections.deque(maxlen=window_size) + + def add_value(self, value): + self.deque.append(value) + + def get_median_value(self): + return np.median(self.deque) + + +def Time(): + return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + + +class TrainingStats(object): + def __init__(self, window_size, stats_keys): + self.window_size = window_size + self.smoothed_losses_and_metrics = { + key: SmoothedValue(window_size) + for key in stats_keys + } + + def update(self, stats): + for k, v in stats.items(): + if k not in self.smoothed_losses_and_metrics: + self.smoothed_losses_and_metrics[k] = SmoothedValue( + self.window_size) + self.smoothed_losses_and_metrics[k].add_value(v) + + def get(self, extras=None): + stats = collections.OrderedDict() + if extras: + for k, v in extras.items(): + stats[k] = v + for k, v in self.smoothed_losses_and_metrics.items(): + stats[k] = round(v.get_median_value(), 6) + + return stats + + def log(self, extras=None): + d = self.get(extras) + strs = [] + for k, v in d.items(): + strs.append('{}: {:x<6f}'.format(k, v)) + strs = ', '.join(strs) + return strs diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..ebff2fe7f6aeb18864a182f68f843847f4d14013 --- /dev/null +++ b/ppocr/utils/utility.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import imghdr +import cv2 +import random +import numpy as np +import paddle + + +def print_dict(d, logger, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", str(k))) + print_dict(v, logger, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", str(k))) + for value in v: + print_dict(value, logger, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + + +def get_check_global_params(mode): + check_params = ['use_gpu', 'max_text_length', 'image_shape', \ + 'image_shape', 'character_type', 'loss_type'] + if mode == "train_eval": + check_params = check_params + [ \ + 'train_batch_size_per_card', 'test_batch_size_per_card'] + elif mode == "test": + check_params = check_params + ['test_batch_size_per_card'] + return check_params + + +def _check_image_file(path): + img_end = {'jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'tiff', 'gif', 'pdf'} + return any([path.lower().endswith(e) for e in img_end]) + + +def get_image_file_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + if os.path.isfile(img_file) and _check_image_file(img_file): + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if os.path.isfile(file_path) and _check_image_file(file_path): + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def check_and_read(img_path): + if os.path.basename(img_path)[-3:].lower() == 'gif': + gif = cv2.VideoCapture(img_path) + ret, frame = gif.read() + if not ret: + logger = logging.getLogger('ppocr') + logger.info("Cannot read {}. 
This gif image maybe corrupted.") + return None, False + if len(frame.shape) == 2 or frame.shape[-1] == 1: + frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB) + imgvalue = frame[:, :, ::-1] + return imgvalue, True, False + elif os.path.basename(img_path)[-3:].lower() == 'pdf': + import fitz + from PIL import Image + imgs = [] + with fitz.open(img_path) as pdf: + for pg in range(0, pdf.page_count): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.get_pixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + return imgs, False, True + return None, False, False + + +def load_vqa_bio_label_maps(label_map_path): + with open(label_map_path, "r", encoding='utf-8') as fin: + lines = fin.readlines() + old_lines = [line.strip() for line in lines] + lines = ["O"] + for line in old_lines: + # "O" has already been in lines + if line.upper() in ["OTHER", "OTHERS", "IGNORE"]: + continue + lines.append(line) + labels = ["O"] + for line in lines[1:]: + labels.append("B-" + line) + labels.append("I-" + line) + label2id_map = {label.upper(): idx for idx, label in enumerate(labels)} + id2label_map = {idx: label.upper() for idx, label in enumerate(labels)} + return label2id_map, id2label_map + + +def set_seed(seed=1024): + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +class AverageMeter: + def __init__(self): + self.reset() + + def reset(self): + """reset""" + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """update""" + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count diff --git a/ppocr/utils/visual.py b/ppocr/utils/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..b6de446593984788bea5c03026f4a5b8c0187909 --- /dev/null +++ b/ppocr/utils/visual.py @@ -0,0 +1,125 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import cv2 +import os +import numpy as np +from PIL import Image, ImageDraw, ImageFont + + +def draw_ser_results(image, + ocr_results, + font_path="doc/fonts/simfang.ttf", + font_size=14): + np.random.seed(2021) + color = (np.random.permutation(range(255)), + np.random.permutation(range(255)), + np.random.permutation(range(255))) + color_map = { + idx: (color[0][idx], color[1][idx], color[2][idx]) + for idx in range(1, 255) + } + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + elif isinstance(image, str) and os.path.isfile(image): + image = Image.open(image).convert('RGB') + img_new = image.copy() + draw = ImageDraw.Draw(img_new) + + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + for ocr_info in ocr_results: + if ocr_info["pred_id"] not in color_map: + continue + color = color_map[ocr_info["pred_id"]] + text = "{}: {}".format(ocr_info["pred"], ocr_info["transcription"]) + + if "bbox" in ocr_info: + # draw with ocr engine + bbox = ocr_info["bbox"] + else: + # draw with ocr groundtruth + bbox = trans_poly_to_bbox(ocr_info["points"]) + draw_box_txt(bbox, text, draw, font, font_size, color) + + img_new = Image.blend(image, img_new, 0.7) + return np.array(img_new) + + +def draw_box_txt(bbox, text, draw, font, font_size, color): + + # draw ocr results outline + bbox = ((bbox[0], bbox[1]), (bbox[2], bbox[3])) + draw.rectangle(bbox, fill=color) + + # draw ocr results + tw = font.getsize(text)[0] + th = font.getsize(text)[1] + start_y = max(0, bbox[0][1] - th) + draw.rectangle( + [(bbox[0][0] + 1, start_y), (bbox[0][0] + tw + 1, start_y + th)], + fill=(0, 0, 255)) + draw.text((bbox[0][0] + 1, start_y), text, fill=(255, 255, 255), font=font) + + +def trans_poly_to_bbox(poly): + x1 = np.min([p[0] for p in poly]) + x2 = np.max([p[0] for p in poly]) + y1 = np.min([p[1] for p in poly]) + y2 = np.max([p[1] for p in poly]) + return [x1, y1, x2, y2] + + +def draw_re_results(image, + result, + font_path="doc/fonts/simfang.ttf", + font_size=18): + np.random.seed(0) + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + elif isinstance(image, str) and os.path.isfile(image): + image = Image.open(image).convert('RGB') + img_new = image.copy() + draw = ImageDraw.Draw(img_new) + + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + color_head = (0, 0, 255) + color_tail = (255, 0, 0) + color_line = (0, 255, 0) + + for ocr_info_head, ocr_info_tail in result: + draw_box_txt(ocr_info_head["bbox"], ocr_info_head["transcription"], + draw, font, font_size, color_head) + draw_box_txt(ocr_info_tail["bbox"], ocr_info_tail["transcription"], + draw, font, font_size, color_tail) + + center_head = ( + (ocr_info_head['bbox'][0] + ocr_info_head['bbox'][2]) // 2, + (ocr_info_head['bbox'][1] + ocr_info_head['bbox'][3]) // 2) + center_tail = ( + (ocr_info_tail['bbox'][0] + ocr_info_tail['bbox'][2]) // 2, + (ocr_info_tail['bbox'][1] + ocr_info_tail['bbox'][3]) // 2) + + draw.line([center_head, center_tail], fill=color_line, width=5) + + img_new = Image.blend(image, img_new, 0.5) + return np.array(img_new) + + +def draw_rectangle(img_path, boxes): + boxes = np.array(boxes) + img = cv2.imread(img_path) + img_show = img.copy() + for box in boxes.astype(int): + x1, y1, x2, y2 = box + cv2.rectangle(img_show, (x1, y1), (x2, y2), (255, 0, 0), 2) + return img_show \ No newline at end of file diff --git a/ppstructure/README.md b/ppstructure/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e44ba588601b1049ccb087bf5985f2c13d6f451d --- 
/dev/null +++ b/ppstructure/README.md @@ -0,0 +1,116 @@ +English | [简体中文](README_ch.md) + +# PP-Structure + +- [1. Introduction](#1-introduction) +- [2. Features](#2-features) +- [3. Results](#3-results) + - [3.1 Layout analysis and table recognition](#31-layout-analysis-and-table-recognition) + - [3.2 Layout Recovery](#32-layout-recovery) + - [3.3 KIE](#33-kie) +- [4. Quick start](#4-quick-start) +- [5. Model List](#5-model-list) + +## 1. Introduction + +PP-Structure is an intelligent document analysis system developed by the PaddleOCR team, which aims to help developers better complete tasks related to document understanding such as layout analysis and table recognition. + +The pipeline of PP-StructureV2 system is shown below. The document image first passes through the image direction correction module to identify the direction of the entire image and complete the direction correction. Then, two tasks of layout information analysis and key information extraction can be completed. + +- In the layout analysis task, the image first goes through the layout analysis model to divide the image into different areas such as text, table, and figure, and then analyze these areas separately. For example, the table area is sent to the form recognition module for structured recognition, and the text area is sent to the OCR engine for text recognition. Finally, the layout recovery module restores it to a word or pdf file with the same layout as the original image; +- In the key information extraction task, the OCR engine is first used to extract the text content, and then the SER(semantic entity recognition) module obtains the semantic entities in the image, and finally the RE(relationship extraction) module obtains the correspondence between the semantic entities, thereby extracting the required key information. + + +More technical details: 👉 [PP-StructureV2 Technical Report](https://arxiv.org/abs/2210.05391) + +PP-StructureV2 supports independent use or flexible collocation of each module. For example, you can use layout analysis alone or table recognition alone. Click the corresponding link below to get the tutorial for each independent module: + +- [Layout Analysis](layout/README.md) +- [Table Recognition](table/README.md) +- [Key Information Extraction](kie/README.md) +- [Layout Recovery](recovery/README.md) + +## 2. Features + +The main features of PP-StructureV2 are as follows: +- Support layout analysis of documents in the form of images/pdfs, which can be divided into areas such as **text, titles, tables, figures, formulas, etc.**; +- Support common Chinese and English **table detection** tasks; +- Support structured table recognition, and output the final result to **Excel file**; +- Support multimodal-based Key Information Extraction (KIE) tasks - **Semantic Entity Recognition** (SER) and **Relation Extraction (RE); +- Support **layout recovery**, that is, restore the document in word or pdf format with the same layout as the original image; +- Support customized training and multiple inference deployment methods such as python whl package quick start; +- Connect with the semi-automatic data labeling tool PPOCRLabel, which supports the labeling of layout analysis, table recognition, and SER. + +## 3. Results + +PP-StructureV2 supports the independent use or flexible collocation of each module. For example, layout analysis can be used alone, or table recognition can be used alone. Only the visualization effects of several representative usage methods are shown here. 
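Before looking at the visualization results, it may help to see how such a combined pipeline is usually driven from Python. The snippet below is a minimal sketch based on the `PPStructure` interface of the `paddleocr` whl package mentioned in the features above; the image path is a placeholder for a document image of your own, and the exact constructor options may differ between paddleocr releases.

```python
import os
import cv2
from paddleocr import PPStructure, save_structure_res

# Layout analysis + table recognition + OCR in a single engine.
engine = PPStructure()

img_path = "doc_image.jpg"  # placeholder: any document image
img = cv2.imread(img_path)
result = engine(img)

# Persist structured results: Excel files for tables, cropped figures, text for OCR areas.
save_structure_res(result, "./output", os.path.splitext(os.path.basename(img_path))[0])

for region in result:
    region.pop("img", None)  # drop raw pixels before printing
    print(region["type"], region["bbox"])
```

Each returned region carries a `type` (text, title, table, figure, ...), its bounding box and the recognition result for that area, which is what the layout recovery and KIE steps described below build on.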
+ +### 3.1 Layout analysis and table recognition + +The figure shows the pipeline of layout analysis + table recognition. The image is first divided into four kinds of areas (image, text, title and table) by layout analysis; OCR detection and recognition is then performed on the image, text and title areas, table recognition is performed on the table areas, and the image areas are also cropped and saved for later use. + +### 3.2 Layout recovery + +The following figure shows the effect of layout recovery based on the results of layout analysis and table recognition in the previous section. + +### 3.3 KIE + +* SER + +Different colored boxes in the figure represent different categories.
+ +* RE + +In the figure, the red box represents `Question`, the blue box represents `Answer`, and `Question` and `Answer` are connected by green lines. + +
+ +## 4. Quick start + +Start from [Quick Start](./docs/quickstart_en.md). + +## 5. Model List + +Some tasks need to use both the structured analysis models and the OCR models. For example, the table recognition task needs to use the table recognition model for structured analysis, and the OCR model to recognize the text in the table. Please select the appropriate models according to your specific needs. + +For structural analysis related model downloads, please refer to: +- [PP-Structure Model Zoo](./docs/models_list_en.md) + +For OCR related model downloads, please refer to: +- [PP-OCR Model Zoo](../doc/doc_en/models_list_en.md) diff --git a/ppstructure/README_ch.md b/ppstructure/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..53c251d15417291a0d024ef90d0013fe30ef42d3 --- /dev/null +++ b/ppstructure/README_ch.md @@ -0,0 +1,122 @@ +[English](README.md) | 简体中文 + +# PP-Structure 文档分析 + +- [1. 简介](#1) +- [2. 特性](#2) +- [3. 效果展示](#3) + - [3.1 版面分析和表格识别](#31) + - [3.2 版面恢复](#32) + - [3.3 关键信息抽取](#33) +- [4. 快速体验](#4) +- [5. 模型库](#5) + + +## 1. 简介 + +PP-Structure是PaddleOCR团队自研的智能文档分析系统,旨在帮助开发者更好的完成版面分析、表格识别等文档理解相关任务。 + +PP-StructureV2系统流程图如下所示,文档图像首先经过图像矫正模块,判断整图方向并完成转正,随后可以完成版面信息分析与关键信息抽取2类任务。 +- 版面分析任务中,图像首先经过版面分析模型,将图像划分为文本、表格、图像等不同区域,随后对这些区域分别进行识别,如,将表格区域送入表格识别模块进行结构化识别,将文本区域送入OCR引擎进行文字识别,最后使用版面恢复模块将其恢复为与原始图像布局一致的word或者pdf格式的文件; +- 关键信息抽取任务中,首先使用OCR引擎提取文本内容,然后由语义实体识别模块获取图像中的语义实体,最后经关系抽取模块获取语义实体之间的对应关系,从而提取需要的关键信息。 + + + +更多技术细节:👉 PP-StructureV2技术报告 [中文版](docs/PP-StructureV2_introduction.md),[英文版](https://arxiv.org/abs/2210.05391)。 + +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,点击下面相应链接获取各个独立模块的使用教程: + +- [版面分析](layout/README_ch.md) +- [表格识别](table/README_ch.md) +- [关键信息抽取](kie/README_ch.md) +- [版面复原](recovery/README_ch.md) + + +## 2. 特性 + +PP-StructureV2的主要特性如下: +- 支持对图片/pdf形式的文档进行版面分析,可以划分**文字、标题、表格、图片、公式等**区域; +- 支持通用的中英文**表格检测**任务; +- 支持表格区域进行结构化识别,最终结果输出**Excel文件**; +- 支持基于多模态的关键信息抽取(Key Information Extraction,KIE)任务-**语义实体识别**(Semantic Entity Recognition,SER)和**关系抽取**(Relation Extraction,RE); +- 支持**版面复原**,即恢复为与原始图像布局一致的word或者pdf格式的文件; +- 支持自定义训练及python whl包调用等多种推理部署方式,简单易用; +- 与半自动数据标注工具PPOCRLabel打通,支持版面分析、表格识别、SER三种任务的标注。 + + +## 3. 效果展示 +PP-StructureV2支持各个模块独立使用或灵活搭配,如,可以单独使用版面分析,或单独使用表格识别,这里仅展示几种代表性使用方式的可视化效果。 + + +### 3.1 版面分析和表格识别 +下图展示了版面分析+表格识别的整体流程,图片先有版面分析划分为图像、文本、标题和表格四种区域,然后对图像、文本和标题三种区域进行OCR的检测识别,对表格进行表格识别,其中图像还会被存储下来以便使用。 + + + +### 3.2 版面恢复 +下图展示了基于上一节版面分析和表格识别的结果进行版面恢复的效果。 + + + + +### 3.3 关键信息抽取 + +* SER + +图中不同颜色的框表示不同的类别。 + +
+ +* RE + +图中红色框表示`问题`,蓝色框表示`答案`,`问题`和`答案`之间使用绿色线连接。 + +
+ + +## 4. 快速体验 + +请参考[快速使用](./docs/quickstart.md)教程。 + + +## 5. 模型库 + +部分任务需要同时用到结构化分析模型和OCR模型,如表格识别需要使用表格识别模型进行结构化解析,同时也要用到OCR模型对表格内的文字进行识别,请根据具体需求选择合适的模型。 + +结构化分析相关模型下载可以参考: +- [PP-Structure 模型库](./docs/models_list.md) + +OCR相关模型下载可以参考: +- [PP-OCR 模型库](../doc/doc_ch/models_list.md) diff --git a/ppstructure/__init__.py b/ppstructure/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppstructure/kie/README.md b/ppstructure/kie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d8e18443f7f58a5a1ad44212f3c46997f07dd254 --- /dev/null +++ b/ppstructure/kie/README.md @@ -0,0 +1,280 @@ +English | [简体中文](README_ch.md) + +# Key Information Extraction (KIE) + +- [1. Introduction](#1-introduction) +- [2. Performance](#2-performance) +- [3. Visualization](#3-visualization) + - [3.1 SER](#31-ser) + - [3.2 RE](#32-re) +- [4. Usage](#4-usage) + - [4.1 Prepare for the environment](#41-prepare-for-the-environment) + - [4.2 Quick start](#42-quick-start) + - [4.3 More](#43-more) +- [5. Reference](#5-reference) +- [6. License](#6-license) + + +## 1. Introduction + +Key information extraction (KIE) refers to extracting key information from text or images. As downstream task of OCR, the key information extraction task of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. + +PP-Structure conducts research based on the LayoutXLM multi-modal, and proposes the VI-LayoutXLM, which gets rid of visual features when finetuning the downstream tasks. An textline sorting method is also utilized to fit in reading order. What's more, UDML knowledge distillation is used for higher accuracy. Finally, the accuracy and inference speed of VI-LayoutXLM surpass those of LayoutXLM. + +The main features of the key information extraction module in PP-Structure are as follows. + + +- Integrate multi-modal methods such as [LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf), VI-LayoutXLM, and PP-OCR inference engine. +- Supports Semantic Entity Recognition (SER) and Relation Extraction (RE) tasks based on multimodal methods. Based on the SER task, the text recognition and classification in the image can be completed; based on the RE task, the relationship extraction of the text content in the image can be completed, such as judging the problem pair (pair). +- Supports custom training for SER tasks and RE tasks. +- Supports end-to-end system prediction and evaluation of OCR+SER. +- Supports end-to-end system prediction of OCR+SER+RE. +- Support SER model export and inference using PaddleInference. + + +## 2. 
Performance + +We evaluate the methods on the Chinese dataset of [XFUND](https://github.com/doc-analysis/XFUND), and the performance is as follows + +|Model | Backbone | Task | Config file | Hmean | Inference time (ms) | Download link| +| --- | --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 | [trained model](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**| 15.49|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%| 19.49|[trained model](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| + + +* Note:Inference environment:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,tested using fp16. + +For more KIE models in PaddleOCR, please refer to [KIE model zoo](../../doc/doc_en/algorithm_overview_en.md). + + +## 3. Visualization + +There are two main solutions to the key information extraction task based on VI-LayoutXLM series model. + +(1) Text detection + text recognition + semantic entity recognition (SER) + +(2) Text detection + text recognition + semantic entity recognition (SER) + relationship extraction (RE) + + +The following images are demo results of the SER and RE models. For more detailed introduction to the above solutions, please refer to [KIE Guide](./how_to_do_kie.md). + +### 3.1 SER + +Demo results for SER task are as follows. + +
+ +**Note:** test pictures are from the [xfund dataset](https://github.com/doc-analysis/XFUND), the [invoice dataset](https://aistudio.baidu.com/aistudio/datasetdetail/165561) and a composite ID card dataset. + +Boxes of different colors in the image represent different categories. + +The invoice and application form images contain three categories: `question`, `answer` and `header`. The recognized `question` and `answer` fields can then be used to extract their relationships. + +For the ID card image, the model can directly identify key information such as `name`, `gender` and `nationality`, so the subsequent relationship extraction step is not required and the key information extraction task can be completed with only one model. + +### 3.2 RE + +Demo results for RE task are as follows.
+Red boxes are questions, blue boxes are answers. The green lines mean that the two connected objects form a question-answer pair. + +## 4. Usage + +### 4.1 Prepare for the environment + +Use the following command to install KIE dependencies. + +```bash +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# install the PaddleOCR whl package for prediction +pip install paddleocr -U +``` + +The visualized results of SER are saved in the `./output` folder by default. Examples of results are as follows.
+ +
+ + +### 4.2 Quick start + +Here we use XFUND dataset to quickly experience the SER model and RE model. + + +#### 4.2.1 Prepare for the dataset + +```bash +mkdir train_data +cd train_data +# download and uncompress the dataset +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 Predict images using the trained model + +Use the following command to download the models. + +```bash +mkdir pretrained_model +cd pretrained_model +# download and uncompress the SER trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# download and uncompress the RE trained model +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + + +If you want to use OCR engine to obtain end-to-end prediction results, you can use the following command to predict. + +```bash +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +The visual result images and the predicted text file will be saved in the `Global.save_res_path` directory. + + +If you want to load the text detection and recognition results collected before, you can use the following command to predict. + +```bash +# just predict using SER trained model +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# predict using SER and RE trained model at the same time +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 Inference using PaddleInference + +Firstly, download the inference SER inference model. + +```bash +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. +``` + +- SER + +Use the following command for inference. 
+ + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visual results and text file will be saved in directory `output`. + +- RE + +Use the following command for inference. + + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +The visual results and text file will be saved in directory `output`. + + +### 4.3 More + +For training, evaluation and inference tutorial for KIE models, please refer to [KIE doc](../../doc/doc_en/kie_en.md). + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](../../doc/doc_en/detection_en.md). + +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](../../doc/doc_en/recognition_en.md). + +To complete the key information extraction task in your own scenario from data preparation to model selection, please refer to: [Guide to End-to-end KIE](./how_to_do_kie_en.md)。 + + +## 5. Reference + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf +- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm +- XFUND dataset, https://github.com/doc-analysis/XFUND + +## 6. License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/kie/README_ch.md b/ppstructure/kie/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..7a8b1942b1849834f8843c8f272ce08e95f4b993 --- /dev/null +++ b/ppstructure/kie/README_ch.md @@ -0,0 +1,262 @@ +[English](README.md) | 简体中文 + +# 关键信息抽取 + +- [1. 简介](#1-简介) +- [2. 精度与性能](#2-精度与性能) +- [3. 效果演示](#3-效果演示) + - [3.1 SER](#31-ser) + - [3.2 RE](#32-re) +- [4. 使用](#4-使用) + - [4.1 准备环境](#41-准备环境) + - [4.2 快速开始](#42-快速开始) + - [4.3 更多](#43-更多) +- [5. 参考链接](#5-参考链接) +- [6. License](#6-License) + + +## 1. 简介 + +关键信息抽取 (Key Information Extraction, KIE)指的是是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。 + +PP-Structure 基于 LayoutXLM 文档多模态系列方法进行研究与优化,设计了视觉特征无关的多模态模型结构VI-LayoutXLM,同时引入符合阅读顺序的文本行排序方法以及UDML联合互学习蒸馏方法,最终在精度与速度均超越LayoutXLM。 + +PP-Structure中关键信息抽取模块的主要特性如下: + +- 集成[LayoutXLM](https://arxiv.org/pdf/2104.08836.pdf)、VI-LayoutXLM等多模态模型以及PP-OCR预测引擎。 +- 支持基于多模态方法的语义实体识别 (Semantic Entity Recognition, SER) 以及关系抽取 (Relation Extraction, RE) 任务。基于 SER 任务,可以完成对图像中的文本识别与分类;基于 RE 任务,可以完成对图象中的文本内容的关系提取,如判断问题对(pair)。 +- 支持SER任务和RE任务的自定义训练。 +- 支持OCR+SER的端到端系统预测与评估。 +- 支持OCR+SER+RE的端到端系统预测。 +- 支持SER模型的动转静导出与基于PaddleInfernece的模型推理。 + + +## 2. 
精度与性能 + + +我们在 [XFUND](https://github.com/doc-analysis/XFUND) 的中文数据集上对算法进行了评估,SER与RE上的任务性能如下 + +|模型|骨干网络|任务|配置文件|hmean|预测耗时(ms)|下载链接| +| --- | --- | --- | --- | --- | --- | --- | +|VI-LayoutXLM| VI-LayoutXLM-base | SER | [ser_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh_udml.yml)|**93.19%**| 15.49|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | SER | [ser_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/ser_layoutxlm_xfund_zh.yml)|90.38%| 19.49 | [训练模型](https://paddleocr.bj.bcebos.com/pplayout/ser_LayoutXLM_xfun_zh.tar)| +|VI-LayoutXLM| VI-LayoutXLM-base | RE | [re_vi_layoutxlm_xfund_zh_udml.yml](../../configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh_udml.yml)|**83.92%**| 15.49|[训练模型](https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar)| +|LayoutXLM| LayoutXLM-base | RE | [re_layoutxlm_xfund_zh.yml](../../configs/kie/layoutlm_series/re_layoutxlm_xfund_zh.yml)|74.83%| 19.49|[训练模型](https://paddleocr.bj.bcebos.com/pplayout/re_LayoutXLM_xfun_zh.tar)| + + +* 注:预测耗时测试条件:V100 GPU + cuda10.2 + cudnn8.1.1 + TensorRT 7.2.3.4,使用FP16进行测试。 + +更多关于PaddleOCR中关键信息抽取模型的介绍,请参考[关键信息抽取模型库](../../doc/doc_ch/algorithm_overview.md)。 + + +## 3. 效果演示 + +基于多模态模型的关键信息抽取任务有2种主要的解决方案。 + +(1)文本检测 + 文本识别 + 语义实体识别(SER) +(2)文本检测 + 文本识别 + 语义实体识别(SER) + 关系抽取(RE) + +下面给出SER与RE任务的示例效果,关于上述解决方案的详细介绍,请参考[关键信息抽取全流程指南](./how_to_do_kie.md)。 + +### 3.1 SER + +对于SER任务,效果如下所示。 + +
+ +**注意:** 测试图片来源于[XFUND数据集](https://github.com/doc-analysis/XFUND)、[发票数据集](https://aistudio.baidu.com/aistudio/datasetdetail/165561)以及合成的身份证数据集。 + + +图中不同颜色的框表示不同的类别。 + +图中的发票以及申请表图像,有`QUESTION`, `ANSWER`, `HEADER` 3种类别,识别的`QUESTION`, `ANSWER`可以用于后续的问题与答案的关系抽取。 + +图中的身份证图像,则直接识别出其中的`姓名`、`性别`、`民族`等关键信息,这样就无需后续的关系抽取过程,一个模型即可完成关键信息抽取。 + + +### 3.2 RE + +对于RE任务,效果如下所示。 + +
+ + +红色框是问题,蓝色框是答案。绿色线条表示连接的两端为一个key-value的pair。 + +## 4. 使用 + +### 4.1 准备环境 + +使用下面的命令安装运行SER与RE关键信息抽取的依赖。 + +```bash +git clone https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip install -r requirements.txt +pip install -r ppstructure/kie/requirements.txt +# 安装PaddleOCR引擎用于预测 +pip install paddleocr -U +``` + +### 4.2 快速开始 + +下面XFUND数据集,快速体验SER模型与RE模型。 + +#### 4.2.1 准备数据 + +```bash +mkdir train_data +cd train_data +# 下载与解压数据 +wget https://paddleocr.bj.bcebos.com/ppstructure/dataset/XFUND.tar && tar -xf XFUND.tar +cd .. +``` + +#### 4.2.2 基于动态图的预测 + +首先下载模型。 + +```bash +mkdir pretrained_model +cd pretrained_model +# 下载并解压SER预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_pretrained.tar && tar -xf ser_vi_layoutxlm_xfund_pretrained.tar + +# 下载并解压RE预训练模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_pretrained.tar && tar -xf re_vi_layoutxlm_xfund_pretrained.tar +``` + +如果希望使用OCR引擎,获取端到端的预测结果,可以使用下面的命令进行预测。 + +```bash +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./ppstructure/docs/kie/input/zh_val_42.jpg + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/image/zh_val_42.jpg \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +`Global.save_res_path`目录中会保存可视化的结果图像以及预测的文本文件。 + + +如果希望加载标注好的文本检测与识别结果,仅预测可以使用下面的命令进行预测。 + +```bash +# 仅预测SER模型 +python3 tools/infer_kie_token_ser.py \ + -c configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False + +# SER + RE模型串联 +python3 ./tools/infer_kie_token_ser_re.py \ + -c configs/kie/vi_layoutxlm/re_vi_layoutxlm_xfund_zh.yml \ + -o Architecture.Backbone.checkpoints=./pretrained_model/re_vi_layoutxlm_xfund_pretrained/best_accuracy \ + Global.infer_img=./train_data/XFUND/zh_val/val.json \ + Global.infer_mode=False \ + -c_ser configs/kie/vi_layoutxlm/ser_vi_layoutxlm_xfund_zh.yml \ + -o_ser Architecture.Backbone.checkpoints=./pretrained_model/ser_vi_layoutxlm_xfund_pretrained/best_accuracy +``` + +#### 4.2.3 基于PaddleInference的预测 + +首先下载SER和RE的推理模型。 + +```bash +mkdir inference +cd inference +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/ser_vi_layoutxlm_xfund_infer.tar && tar -xf ser_vi_layoutxlm_xfund_infer.tar +wget https://paddleocr.bj.bcebos.com/ppstructure/models/vi_layoutxlm/re_vi_layoutxlm_xfund_infer.tar && tar -xf re_vi_layoutxlm_xfund_infer.tar +cd .. 
+``` + +- SER + +执行下面的命令进行预测。 + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser.py \ + --kie_algorithm=LayoutXLM \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化结果保存在`output`目录下。 + +- RE + +执行下面的命令进行预测。 + +```bash +cd ppstructure +python3 kie/predict_kie_token_ser_re.py \ + --kie_algorithm=LayoutXLM \ + --re_model_dir=../inference/re_vi_layoutxlm_xfund_infer \ + --ser_model_dir=../inference/ser_vi_layoutxlm_xfund_infer \ + --use_visual_backbone=False \ + --image_dir=./docs/kie/input/zh_val_42.jpg \ + --ser_dict_path=../train_data/XFUND/class_list_xfun.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --ocr_order_method="tb-yx" +``` + +可视化结果保存在`output`目录下。 + + +### 4.3 更多 + +关于KIE模型的训练评估与推理,请参考:[关键信息抽取教程](../../doc/doc_ch/kie.md)。 + +关于文本检测模型的训练评估与推理,请参考:[文本检测教程](../../doc/doc_ch/detection.md)。 + +关于文本识别模型的训练评估与推理,请参考:[文本识别教程](../../doc/doc_ch/recognition.md)。 + +关于怎样在自己的场景中完成关键信息抽取任务,请参考:[关键信息抽取全流程指南](./how_to_do_kie.md)。 + + +## 5. 参考链接 + +- LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding, https://arxiv.org/pdf/2104.08836.pdf +- microsoft/unilm/layoutxlm, https://github.com/microsoft/unilm/tree/master/layoutxlm +- XFUND dataset, https://github.com/doc-analysis/XFUND + +## 6. License + +The content of this project itself is licensed under the [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/) diff --git a/ppstructure/kie/how_to_do_kie.md b/ppstructure/kie/how_to_do_kie.md new file mode 100644 index 0000000000000000000000000000000000000000..0c47315d6e484720633a8e9709fee693f055810a --- /dev/null +++ b/ppstructure/kie/how_to_do_kie.md @@ -0,0 +1,168 @@ + +# 怎样完成基于图像数据的信息抽取任务 + +- [1. 简介](#1-简介) + - [1.1 背景](#11-背景) + - [1.2 主流方法](#12-主流方法) +- [2. 关键信息抽取任务流程](#2-关键信息抽取任务流程) + - [2.1 训练OCR模型](#21-训练OCR模型) + - [2.2 训练KIE模型](#22-训练KIE模型) +- [3. 参考文献](#3-参考文献) + + +## 1. 简介 + +### 1.1 背景 + +关键信息抽取 (Key Information Extraction, KIE)指的是是从文本或者图像中,抽取出关键的信息。针对文档图像的关键信息抽取任务作为OCR的下游任务,存在非常多的实际应用场景,如表单识别、车票信息抽取、身份证信息抽取等。然而,使用人力从这些文档图像中提取或者收集关键信息耗时费力,怎样自动化融合图像中的视觉、布局、文字等特征并完成关键信息抽取是一个价值与挑战并存的问题。 + +对于特定场景的文档图像,其中的关键信息位置、版式等较为固定,因此在研究早期有很多基于模板匹配的方法进行关键信息的抽取,考虑到其流程较为简单,该方法仍然被广泛应用在目前的很多场景中。但是这种基于模板匹配的方法在应用到不同的场景中时,需要耗费大量精力去调整与适配模板,迁移成本较高。 + +文档图像中的KIE一般包含2个子任务,示意图如下图所示。 + +* (1)SER: 语义实体识别 (Semantic Entity Recognition),对每一个检测到的文本进行分类,如将其分为姓名,身份证。如下图中的黑色框和红色框。 +* (2)RE: 关系抽取 (Relation Extraction),对每一个检测到的文本进行分类,如将其分为问题 (key) 和答案 (value) 。然后对每一个问题找到对应的答案,相当于完成key-value的匹配过程。如下图中的红色框和黑色框分别代表问题和答案,黄色线代表问题和答案之间的对应关系。 + + +
+ +
+ + +### 1.2 基于深度学习的主流方法 + +一般的KIE方法基于命名实体识别(Named Entity Recognition,NER)来展开研究,但是此类方法仅使用了文本信息而忽略了位置与视觉特征信息,因此精度受限。近几年大多学者开始融合多个模态的输入信息,进行特征融合,并对多模态信息进行处理,从而提升KIE的精度。主要方法有以下几种 + +* (1)基于Grid的方法:此类方法主要关注图像层面多模态信息的融合,文本大多大多为字符粒度,对文本与结构结构信息的嵌入方式较为简单,如Chargrid[1]等算法。 +* (2)基于Token的方法:此类方法参考NLP中的BERT等方法,将位置、视觉等特征信息共同编码到多模态模型中,并且在大规模数据集上进行预训练,从而在下游任务中,仅需要少量的标注数据便可以获得很好的效果。如LayoutLM[2], LayoutLMv2[3], LayoutXLM[4], StrucText[5]等算法。 +* (3)基于GCN的方法:此类方法尝试学习图像、文字之间的结构信息,从而可以解决开集信息抽取的问题(训练集中没有见过的模板),如GCN[6]、SDMGR[7]等算法。 +* (4)基于End-to-end的方法:此类方法将现有的OCR文字识别以及KIE信息抽取2个任务放在一个统一的网络中进行共同学习,并在学习过程中相互加强。如Trie[8]等算法。 + +更多关于该系列算法的详细介绍,请参考“动手学OCR·十讲”课程的课节六部分:[文档分析理论与实践](https://aistudio.baidu.com/aistudio/education/group/info/25207)。 + +## 2. 关键信息抽取任务流程 + +PaddleOCR中实现了LayoutXLM等算法(基于Token),同时,在PP-StructureV2中,对LayoutXLM多模态预训练模型的网络结构进行简化,去除了其中的Visual backbone部分,设计了视觉无关的VI-LayoutXLM模型,同时引入符合人类阅读顺序的排序逻辑以及UDML知识蒸馏策略,最终同时提升了关键信息抽取模型的精度与推理速度。 + +下面介绍怎样基于PaddleOCR完成关键信息抽取任务。 + +在非End-to-end的KIE方法中,完成关键信息抽取,至少需要**2个步骤**:首先使用OCR模型,完成文字位置与内容的提取,然后使用KIE模型,根据图像、文字位置以及文字内容,提取出其中的关键信息。 + +### 2.1 训练OCR模型 + +#### 2.1.1 文本检测 + +**(1)数据** + +PaddleOCR中提供的模型大多数为通用模型,在进行文本检测的过程中,相邻文本行的检测一般是根据位置的远近进行区分,如上图,使用PP-OCRv3通用中英文检测模型进行文本检测时,容易将”民族“与“汉”这2个代表不同的字段检测到一起,从而增加后续KIE任务的难度。因此建议在做KIE任务的过程中,首先训练一个针对该文档数据集的检测模型。 + +在数据标注时,关键信息的标注需要隔开,比上图中的 “民族汉” 3个字相隔较近,此时需要将”民族“与”汉“标注为2个文本检测框,否则会增加后续KIE任务的难度。 + +对于下游任务,一般来说,`200~300`张的文本训练数据即可保证基本的训练效果,如果没有太多的先验知识,可以先标注 **`200~300`** 张图片,进行后续文本检测模型的训练。 + + +**(2)模型** + +在模型选择方面,推荐使用PP-OCRv3_det,关于更多关于检测模型的训练方法介绍,请参考:[OCR文本检测模型训练教程](../../doc/doc_ch/detection.md)与[PP-OCRv3 文本检测模型训练教程](../../doc/doc_ch/PPOCRv3_det_train.md)。 + +#### 2.1.2 文本识别 + +相对自然场景,文档图像中的文本内容识别难度一般相对较低(背景相对不太复杂),因此**优先建议**尝试PaddleOCR中提供的PP-OCRv3通用文本识别模型([PP-OCRv3模型库链接](../../doc/doc_ch/models_list.md))。 + +**(1)数据** + +然而,在部分文档场景中也会存在一些挑战,如身份证场景中存在着罕见字,在发票等场景中的字体比较特殊,这些问题都会增加文本识别的难度,此时如果希望保证或者进一步提升模型的精度,建议基于特定文档场景的文本识别数据集,加载PP-OCRv3模型进行微调。 + +在模型微调的过程中,建议准备至少`5000`张垂类场景的文本识别图像,可以保证基本的模型微调效果。如果希望提升模型的精度与泛化能力,可以合成更多与该场景类似的文本识别数据,从公开数据集中收集通用真实文本识别数据,一并添加到该场景的文本识别训练任务过程中。在训练过程中,建议每个epoch的真实垂类数据、合成数据、通用数据比例在`1:1:1`左右,这可以通过设置不同数据源的采样比例进行控制。如有3个训练文本文件,分别包含1W、2W、5W条数据,那么可以在配置文件中设置数据如下: + +```yml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list_1W.txt + - ./train_data/train_list_2W.txt + - ./train_data/train_list_5W.txt + ratio_list: [1.0, 0.5, 0.2] + ... +``` + +**(2)模型** + +在模型选择方面,推荐使用通用中英文文本识别模型PP-OCRv3_rec,关于更多关于文本识别模型的训练方法介绍,请参考:[OCR文本识别模型训练教程](../../doc/doc_ch/recognition.md)与[PP-OCRv3文本识别模型库与配置文件](../../doc/doc_ch/models_list.md)。 + +### 2.2 训练KIE模型 + +对于识别得到的文字进行关键信息抽取,有2种主要的方法。 + +(1)直接使用SER,获取关键信息的类别:如身份证场景中,将“姓名“与”张三“分别标记为`name_key`与`name_value`。最终识别得到的类别为`name_value`对应的**文本字段**即为我们所需要的关键信息。 + +(2)联合SER与RE进行使用:这种方法中,首先使用SER,获取图像文字内容中所有的key与value,然后使用RE方法,对所有的key与value进行配对,找到映射关系,从而完成关键信息的抽取。 + +#### 2.2.1 SER + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等,我们直接将对应的字段标注为特定的类别即可,如下图所示。 + +
+ +
+ +**注意:** + +- 标注过程中,对于无关于KIE关键信息的文本内容,均需要将其标注为`other`类别,相当于背景信息。如在身份证场景中,如果我们不关注性别信息,那么可以将“性别”与“男”这2个字段的类别均标注为`other`。 +- 标注过程中,需要以**文本行**为单位进行标注,无需标注单个字符的位置信息。 + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用[PPOCRLabel](../../PPOCRLabel/README_ch.md)完成KIE的标注过程。 + +模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 + + +#### 2.2.2 SER + RE + +该过程主要包含SER与RE 2个过程。SER阶段主要用于识别出文档图像中的所有key与value,RE阶段主要用于对所有的key与value进行匹配。 + +以身份证场景为例, 关键信息一般包含`姓名`、`性别`、`民族`等关键信息,在SER阶段,我们需要识别所有的question (key) 与answer (value) 。标注如下所示。每个字段的类别信息(`label`字段)可以是question、answer或者other(与待抽取的关键信息无关的字段) + +
+ +
+ + +在RE阶段,需要标注每个字段的的id与连接信息,如下图所示。 + +
+ +
+ +每个文本行字段中,需要添加`id`与`linking`字段信息,`id`记录该文本行的唯一标识,同一张图片中的不同文本内容不能重复,`linking`是一个列表,记录了不同文本之间的连接信息。如字段“出生”的id为0,字段“1996年1月11日”的id为1,那么它们均有[[0, 1]]的`linking`标注,表示该id=0与id=1的字段构成key-value的关系(姓名、性别等字段类似,此处不再一一赘述)。 + + +**注意:** + +- 标注过程中,如果value是多个字符,那么linking中可以新增一个key-value对,如`[[0, 1], [0, 2]]` + + +数据量方面,一般来说,对于比较固定的场景,**50张**左右的训练图片即可达到可以接受的效果,可以使用PPOCRLabel完成KIE的标注过程。 + +模型方面,推荐使用PP-StructureV2中提出的VI-LayoutXLM模型,它基于LayoutXLM模型进行改进,去除其中的视觉特征提取模块,在精度基本无损的情况下,进一步提升了模型推理速度。更多教程请参考:[VI-LayoutXLM算法介绍](../../doc/doc_ch/algorithm_kie_vi_layoutxlm.md)与[KIE关键信息抽取使用教程](../../doc/doc_ch/kie.md)。 + + +## 3. 参考文献 + + +[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018. + +[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. + +[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. + +[4]: Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. + +[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920. + +[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019. + +[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021. + +[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422. diff --git a/ppstructure/kie/how_to_do_kie_en.md b/ppstructure/kie/how_to_do_kie_en.md new file mode 100644 index 0000000000000000000000000000000000000000..400bd1c2d29bb24b01818c9060220ecad1e9806c --- /dev/null +++ b/ppstructure/kie/how_to_do_kie_en.md @@ -0,0 +1,179 @@ + +# Key Information Extraction Pipeline + +- [1. Introduction](#1-Introduction) + - [1.1 Background](#11-Background) + - [1.2 Mainstream Deep-learning Solutions](#12-Mainstream-Deep-learning-Solutions) +- [2. KIE Pipeline](#2-KIE-Pipeline) + - [2.1 Train OCR Models](#21-Train-OCR-Models) + - [2.2 Train KIE Models](#22-Train-KIE-Models) +- [3. Reference](#3-Reference) + + +## 1. Introduction + +### 1.1 Background + +Key information extraction (KIE) refers to extracting key information from text or images. As the downstream task of OCR, KIE of document image has many practical application scenarios, such as form recognition, ticket information extraction, ID card information extraction, etc. However, it is time-consuming and laborious to extract key information from these document images by manpower. It's challengable but also valuable to combine multi-modal features (visual, layout, text, etc) together and complete KIE tasks. + +For the document images in a specific scene, the position and layout of the key information are relatively fixed. Therefore, in the early stage of the research, there are many methods based on template matching to extract the key information. This method is still widely used in many simple scenarios at present. 
However, it takes a long time to adjust the template for different scenarios, so the migration cost of such methods is high. + +KIE on document images generally contains 2 subtasks, as shown below. + +* (1) SER: semantic entity recognition, which classifies each detected textline, for example as a name or an ID card field, as shown by the red boxes in the following figure. + +* (2) RE: relation extraction, which matches each question (key) with its answer (value) based on the SER results, as shown by the yellow arrows in the figure below.
+ +
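To make the two subtasks concrete, the toy example below sketches what SER and RE results conceptually look like for a single document image. The field names are purely illustrative and are not the exact output format of any particular tool.

```python
# Illustrative only: conceptual SER / RE results, not a real tool's output schema.

# SER assigns a semantic category to every detected textline.
ser_results = [
    {"transcription": "Name",         "points": [[10, 10], [80, 10], [80, 30], [10, 30]], "pred": "QUESTION"},
    {"transcription": "Geoff Sample", "points": [[90, 10], [220, 10], [220, 30], [90, 30]], "pred": "ANSWER"},
]

# RE links SER entities into key-value pairs.
re_results = [(ser_results[0], ser_results[1])]

for question, answer in re_results:
    print(f"{question['transcription']}: {answer['transcription']}")  # -> Name: Geoff Sample
```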
+ + + +### 1.2 Mainstream Deep-learning Solutions + +General KIE methods are based on Named Entity Recognition (NER), but such methods only use text information and ignore location and visual feature information, which leads to limited accuracy. In recent years, most scholars have started to combine mutil-modal features to improve the accuracy of KIE model. The main methods are as follows: + +* (1) Grid based methods. These methods mainly focus on the fusion of multi-modal information at the image level. Most texts are of character granularity. The text and structure information embedding method is simple, such as the algorithm of chargrid [1]. + +* (2) Token based methods. These methods refer to the NLP methods such as Bert, which encode the position, vision and other feature information into the multi-modal model, and conduct pre-training on large-scale datasets, so that in downstream tasks, only a small amount of annotation data is required to obtain excellent results. The representative algorithms are layoutlm [2], layoutlmv2 [3], layoutxlm [4], structext [5], etc. + +* (3) GCN based methods. These methods try to learn the structural information between images and characters, so as to solve the problem of extracting open set information (templates not seen in the training set), such as GCN [6], SDMGR [7] and other algorithms. + +* (4) End to end based methods: these methods put the existing OCR character recognition and KIE information extraction tasks into a unified network for common learning, and strengthen each other in the learning process. Such as TRIE [8]. + + +For more detailed introduction of the algorithms, please refer to Chapter 6 of [Diving into OCR](https://aistudio.baidu.com/aistudio/education/group/info/25207). + +## 2. KIE Pipeline + +Token based methods such as LayoutXLM are implemented in PaddleOCR. What's more, in PP-StructureV2, we simplify the LayoutXLM model and proposed VI-LayoutXLM, in which the visual feature extraction module is removed for speed-up. The textline sorting strategy conforming to the human reading order and UDML knowledge distillation strategy are utilized for higher model accuracy. + + +In the non end-to-end KIE method, KIE needs at least ** 2 steps**. Firstly, the OCR model is used to extract the text and its position. Secondly, the KIE model is used to extract the key information according to the image, text position and text content. + + +### 2.1 Train OCR Models + +#### 2.1.1 Text Detection + +**(1) Data** + +Most of the models provided in PaddleOCR are general models. In the process of text detection, the detection of adjacent text lines is generally based on the distance of the position. As shown in the figure above, when using PP-OCRv3 general English detection model for text detection, it is easy to detect the two fields representing different propoerties as one. Therefore, it is suggested to finetune a detection model according to your scenario firstly during the KIE task. + + +During data annotation, the different key information needs to be separated. Otherwise, it will increase the difficulty of subsequent KIE tasks. + +For downstream tasks, generally speaking, `200~300` training images can guarantee the basic training effect. If there is not too much prior knowledge, **`200~300`** images can be labeled firstly for subsequent text detection model training. + +**(2) Model** + +In terms of model selection, PP-OCRv3 detection model is recommended. 
For more information about the training methods of the detection model, please refer to: [Text detection tutorial](../../doc/doc_en/detection_en.md) and [PP-OCRv3 detection model tutorial](../../doc/doc_ch/PPOCRv3_det_train.md). + +#### 2.1.2 Text recognition + +Compared with natural scenes, text recognition in document images is generally easier (the background is not too complex), so **it is suggested to** first try the PP-OCRv3 general text recognition model provided in PaddleOCR ([PP-OCRv3 model list](../../doc/doc_en/models_list_en.md)). + +**(1) Data** + +However, there are also some challenges in certain document scenarios, such as rare words in ID card scenarios and special fonts in invoice and other scenarios. These problems will increase the difficulty of text recognition. If you want to ensure or further improve the model accuracy in such cases, it is recommended to finetune the PP-OCRv3 recognition model on a text recognition dataset collected from the specific document scenario. + +In the process of model finetuning, it is recommended to prepare at least `5000` vertical scene text recognition images to ensure a basic fine-tuning effect. If you want to improve the accuracy and generalization ability of the model, you can synthesize more text recognition images similar to the scene, collect general real text recognition data from public datasets, and add them to the training process. During training, it is suggested that the ratio of real data, synthetic data and general data in each epoch be around `1:1:1`, which can be controlled by setting the sampling ratio of different data sources. If there are 3 training text files, containing 10k, 20k and 50k samples respectively, the data can be set in the configuration file as follows: + +```yml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - ./train_data/train_list_10k.txt + - ./train_data/train_list_20k.txt + - ./train_data/train_list_50k.txt + ratio_list: [1.0, 0.5, 0.2] + ... +``` + +**(2) Model** + +In terms of model selection, the PP-OCRv3 recognition model is recommended. For more information about the training methods of the recognition model, please refer to: [Text recognition tutorial](../../doc/doc_en/recognition_en.md) and [PP-OCRv3 model list](../../doc/doc_en/models_list_en.md). + +### 2.2 Train KIE Models + +There are two main methods to extract the key information from the recognized texts. + +(1) Directly use the SER model to obtain the key information category. For example, in the ID card scenario, we mark "name" and "Geoff Sample" as "name_key" and "name_value", respectively. The **text field** finally identified as the category "name_value" is the key information we need. + +(2) Jointly use the SER and RE models. In this case, we first use the SER model to obtain all questions (keys) and answers (values) in the image text, and then use the RE model to match the keys and values to find their relationships, so as to complete the extraction of key information. + +#### 2.2.1 SER + +Take the ID card scenario as an example. The key information generally includes `name`, `DOB`, etc. We can directly mark the corresponding fields as specific categories, as shown in the following figure.
+ +
+ +**Note:** + +- In the labeling process, text content without key information about KIE shall be labeled as`other`, which is equivalent to background information. For example, in the ID card scenario, if we do not pay attention to `DOB` information, we can mark the categories of `DOB` and `Area manager` as `other`. +- In the annotation process of, it is required to annotate the **textline** position rather than the character. + + +In terms of data, generally speaking, for relatively fixed scenes, **50** training images can achieve acceptable effects. You can refer to [PPOCRLabel](../../PPOCRLabel/README.md) for finish the labeling process. + +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). + + +#### 2.2.2 SER + RE + +The SER model is mainly used to identify all keys and values in the document image, and the RE model is mainly used to match all keys and values. + +Taking the ID card scenario as an example, the key information generally includes key information such as `name`, `DOB`, etc. in the SER stage, we need to identify all questions (keys) and answers (values). The demo annotation is as follows. All keys can be annotated as `question`, and all keys can be annotated as `answer`. + + +
+ +
+ + +In the RE stage, the ID and connection information of each field need to be marked, as shown in the following figure. + +
+ +
+ +For each textline, you need to add 'ID' and 'linking' field information. The 'ID' records the unique identifier of the textline. Different text contents in the same images cannot be repeated. The 'linking' is a list that records the connection information between different texts. If the ID of the field "name" is 0 and the ID of the field "Geoff Sample" is 1, then they all have [[0, 1]] 'linking' marks, indicating that the fields with `id=0` and `id=1` form a key value relationship (the fields such as DOB and Expires are similar, and will not be repeated here). + + +**Note:** + +-During annotation, if value is multiple textines, a key value pair can be added in linking, such as `[[0, 1], [0, 2]]`. + +In terms of data, generally speaking, for relatively fixed scenes, about **50** training images can achieve acceptable effects. + +In terms of model, it is recommended to use the VI-layoutXLM model proposed in PP-StructureV2. It is improved based on the LayoutXLM model, removing the visual feature extraction module, and further improving the model inference speed without the significant reduction on model accuracy. For more tutorials, please refer to [VI-LayoutXLM introduction](../../doc/doc_en/algorithm_kie_vi_layoutxlm_en.md) and [KIE tutorial](../../doc/doc_en/kie_en.md). + + + +## 3. Reference + + +[1] Katti A R, Reisswig C, Guder C, et al. Chargrid: Towards understanding 2d documents[J]. arXiv preprint arXiv:1809.08799, 2018. + +[2] Xu Y, Li M, Cui L, et al. Layoutlm: Pre-training of text and layout for document image understanding[C]//Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020: 1192-1200. + +[3] Xu Y, Xu Y, Lv T, et al. LayoutLMv2: Multi-modal pre-training for visually-rich document understanding[J]. arXiv preprint arXiv:2012.14740, 2020. + +[4]: Xu Y, Lv T, Cui L, et al. Layoutxlm: Multimodal pre-training for multilingual visually-rich document understanding[J]. arXiv preprint arXiv:2104.08836, 2021. + +[5] Li Y, Qian Y, Yu Y, et al. StrucTexT: Structured Text Understanding with Multi-Modal Transformers[C]//Proceedings of the 29th ACM International Conference on Multimedia. 2021: 1912-1920. + +[6] Liu X, Gao F, Zhang Q, et al. Graph convolution for multimodal information extraction from visually rich documents[J]. arXiv preprint arXiv:1903.11279, 2019. + +[7] Sun H, Kuang Z, Yue X, et al. Spatial Dual-Modality Graph Reasoning for Key Information Extraction[J]. arXiv preprint arXiv:2103.14470, 2021. + +[8] Zhang P, Xu Y, Cheng Z, et al. Trie: End-to-end text reading and information extraction for document understanding[C]//Proceedings of the 28th ACM International Conference on Multimedia. 2020: 1413-1422. diff --git a/ppstructure/kie/predict_kie_token_ser.py b/ppstructure/kie/predict_kie_token_ser.py new file mode 100644 index 0000000000000000000000000000000000000000..e570979bcb419edbc2e09e190ae36ec1458c1826 --- /dev/null +++ b/ppstructure/kie/predict_kie_token_ser.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import json +import numpy as np +import time + +import tools.infer.utility as utility +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.visual import draw_ser_results +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppstructure.utility import parse_args + +from paddleocr import PaddleOCR + +logger = get_logger() + + +class SerPredictor(object): + def __init__(self, args): + self.ocr_engine = PaddleOCR( + use_angle_cls=args.use_angle_cls, + det_model_dir=args.det_model_dir, + rec_model_dir=args.rec_model_dir, + show_log=False, + use_gpu=args.use_gpu) + + pre_process_list = [{ + 'VQATokenLabelEncode': { + 'algorithm': args.kie_algorithm, + 'class_path': args.ser_dict_path, + 'contains_re': False, + 'ocr_engine': self.ocr_engine, + 'order_method': args.ocr_order_method, + } + }, { + 'VQATokenPad': { + 'max_seq_len': 512, + 'return_attention_mask': True + } + }, { + 'VQASerTokenChunk': { + 'max_seq_len': 512, + 'return_attention_mask': True + } + }, { + 'Resize': { + 'size': [224, 224] + } + }, { + 'NormalizeImage': { + 'std': [58.395, 57.12, 57.375], + 'mean': [123.675, 116.28, 103.53], + 'scale': '1', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': [ + 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', + 'image', 'labels', 'segment_offset_id', 'ocr_info', + 'entities' + ] + } + }] + postprocess_params = { + 'name': 'VQASerTokenLayoutLMPostProcess', + "class_path": args.ser_dict_path, + } + + self.preprocess_op = create_operators(pre_process_list, + {'infer_mode': True}) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'ser', logger) + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + if data[0] is None: + return None, 0 + starttime = time.time() + + for idx in range(len(data)): + if isinstance(data[idx], np.ndarray): + data[idx] = np.expand_dims(data[idx], axis=0) + else: + data[idx] = [data[idx]] + + for idx in range(len(self.input_tensor)): + self.input_tensor[idx].copy_from_cpu(data[idx]) + + self.predictor.run() + + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + preds = outputs[0] + + post_result = self.postprocess_op( + preds, segment_offset_ids=data[6], ocr_infos=data[7]) + elapse = time.time() - starttime + return post_result, data, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + ser_predictor = SerPredictor(args) + count = 0 + total_time = 0 + + os.makedirs(args.output, exist_ok=True) + with open( + os.path.join(args.output, 'infer.txt'), mode='w', + encoding='utf-8') as f_w: + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + img = img[:, :, ::-1] + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + ser_res, _, elapse = ser_predictor(img) + 
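# SerPredictor.__call__ returns (post_result, data, elapse); keep only the first image's results before writing them to infer.txt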
ser_res = ser_res[0] + + res_str = '{}\t{}\n'.format( + image_file, + json.dumps( + { + "ocr_info": ser_res, + }, ensure_ascii=False)) + f_w.write(res_str) + + img_res = draw_ser_results( + image_file, + ser_res, + font_path=args.vis_font_path, ) + + img_save_path = os.path.join(args.output, + os.path.basename(image_file)) + cv2.imwrite(img_save_path, img_res) + logger.info("save vis result to {}".format(img_save_path)) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/kie/predict_kie_token_ser_re.py b/ppstructure/kie/predict_kie_token_ser_re.py new file mode 100644 index 0000000000000000000000000000000000000000..b29a8f69dbf99fa4410136277d7d92d0d41b2039 --- /dev/null +++ b/ppstructure/kie/predict_kie_token_ser_re.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import json +import numpy as np +import time + +import tools.infer.utility as utility +from tools.infer_kie_token_ser_re import make_input +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.visual import draw_ser_results, draw_re_results +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppstructure.utility import parse_args +from ppstructure.kie.predict_kie_token_ser import SerPredictor + +logger = get_logger() + + +class SerRePredictor(object): + def __init__(self, args): + self.use_visual_backbone = args.use_visual_backbone + self.ser_engine = SerPredictor(args) + if args.re_model_dir is not None: + postprocess_params = {'name': 'VQAReTokenLayoutLMPostProcess'} + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 're', logger) + else: + self.predictor = None + + def __call__(self, img): + starttime = time.time() + ser_results, ser_inputs, ser_elapse = self.ser_engine(img) + if self.predictor is None: + return ser_results, ser_elapse + + re_input, entity_idx_dict_batch = make_input(ser_inputs, ser_results) + if self.use_visual_backbone == False: + re_input.pop(4) + for idx in range(len(self.input_tensor)): + self.input_tensor[idx].copy_from_cpu(re_input[idx]) + + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + preds = dict( + loss=outputs[1], + pred_relations=outputs[2], + hidden_states=outputs[0], ) + + post_result = self.postprocess_op( + preds, + ser_results=ser_results, + entity_idx_dict_batch=entity_idx_dict_batch) + + elapse = time.time() - starttime + return 
post_result, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + ser_re_predictor = SerRePredictor(args) + count = 0 + total_time = 0 + + os.makedirs(args.output, exist_ok=True) + with open( + os.path.join(args.output, 'infer.txt'), mode='w', + encoding='utf-8') as f_w: + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + img = img[:, :, ::-1] + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + re_res, elapse = ser_re_predictor(img) + re_res = re_res[0] + + res_str = '{}\t{}\n'.format( + image_file, + json.dumps( + { + "ocr_info": re_res, + }, ensure_ascii=False)) + f_w.write(res_str) + if ser_re_predictor.predictor is not None: + img_res = draw_re_results( + image_file, re_res, font_path=args.vis_font_path) + img_save_path = os.path.join( + args.output, + os.path.splitext(os.path.basename(image_file))[0] + + "_ser_re.jpg") + else: + img_res = draw_ser_results( + image_file, re_res, font_path=args.vis_font_path) + img_save_path = os.path.join( + args.output, + os.path.splitext(os.path.basename(image_file))[0] + + "_ser.jpg") + + cv2.imwrite(img_save_path, img_res) + logger.info("save vis result to {}".format(img_save_path)) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cfcba764190fd46f98b76c27e93db6f4fa36c45 --- /dev/null +++ b/ppstructure/kie/requirements.txt @@ -0,0 +1,7 @@ +sentencepiece +yacs +seqeval +pypandoc +attrdict +python_docx +paddlenlp>=2.4.1 diff --git a/ppstructure/kie/tools/eval_with_label_end2end.py b/ppstructure/kie/tools/eval_with_label_end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..b0fd84363f450dfb7e4ef18e53adc17ef088cf18 --- /dev/null +++ b/ppstructure/kie/tools/eval_with_label_end2end.py @@ -0,0 +1,259 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
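+
+# This script evaluates SER results end to end: predicted text boxes are matched
+# to ground-truth boxes by IoU, then precision, recall, F-measure and
+# character-level accuracy (based on edit distance) are reported.
+# Both --gt_json_path and --pred_json_path use the "image_path\t{JSON}" format
+# illustrated in parse_ser_results_fp() below. Example invocation (the file
+# paths are only illustrative):
+#   python3 eval_with_label_end2end.py \
+#       --gt_json_path ./train_data/zh_val_gt.txt \
+#       --pred_json_path ./output/ser/infer.txt \
+#       --iou_thres 0.5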
+ +import os +import re +import sys +import shapely +from shapely.geometry import Polygon +import numpy as np +from collections import defaultdict +import operator +from rapidfuzz.distance import Levenshtein +import argparse +import json +import copy + + +def parse_ser_results_fp(fp, fp_type="gt", ignore_background=True): + # img/zh_val_0.jpg { + # "height": 3508, + # "width": 2480, + # "ocr_info": [ + # {"text": "Maribyrnong", "label": "other", "bbox": [1958, 144, 2184, 198]}, + # {"text": "CITYCOUNCIL", "label": "other", "bbox": [2052, 183, 2171, 214]}, + # ] + assert fp_type in ["gt", "pred"] + key = "label" if fp_type == "gt" else "pred" + res_dict = dict() + with open(fp, "r", encoding='utf-8') as fin: + lines = fin.readlines() + + for _, line in enumerate(lines): + img_path, info = line.strip().split("\t") + # get key + image_name = os.path.basename(img_path) + res_dict[image_name] = [] + # get infos + json_info = json.loads(info) + for single_ocr_info in json_info["ocr_info"]: + label = single_ocr_info[key].upper() + if label in ["O", "OTHERS", "OTHER"]: + label = "O" + if ignore_background and label == "O": + continue + single_ocr_info["label"] = label + res_dict[image_name].append(copy.deepcopy(single_ocr_info)) + return res_dict + + +def polygon_from_str(polygon_points): + """ + Create a shapely polygon object from gt or dt line. + """ + polygon_points = np.array(polygon_points).reshape(4, 2) + polygon = Polygon(polygon_points).convex_hull + return polygon + + +def polygon_iou(poly1, poly2): + """ + Intersection over union between two shapely polygons. + """ + if not poly1.intersects( + poly2): # this test is fast and can accelerate calculation + iou = 0 + else: + try: + inter_area = poly1.intersection(poly2).area + union_area = poly1.area + poly2.area - inter_area + iou = float(inter_area) / union_area + except shapely.geos.TopologicalError: + # except Exception as e: + # print(e) + print('shapely.geos.TopologicalError occurred, iou set to 0') + iou = 0 + return iou + + +def ed(args, str1, str2): + if args.ignore_space: + str1 = str1.replace(" ", "") + str2 = str2.replace(" ", "") + if args.ignore_case: + str1 = str1.lower() + str2 = str2.lower() + return Levenshtein.distance(str1, str2) + + +def convert_bbox_to_polygon(bbox): + """ + bbox : [x1, y1, x2, y2] + output: [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] + """ + xmin, ymin, xmax, ymax = bbox + poly = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] + return poly + + +def eval_e2e(args): + # gt + gt_results = parse_ser_results_fp(args.gt_json_path, "gt", + args.ignore_background) + # pred + dt_results = parse_ser_results_fp(args.pred_json_path, "pred", + args.ignore_background) + iou_thresh = args.iou_thres + num_gt_chars = 0 + gt_count = 0 + dt_count = 0 + hit = 0 + ed_sum = 0 + + for img_name in dt_results: + gt_info = gt_results[img_name] + gt_count += len(gt_info) + + dt_info = dt_results[img_name] + dt_count += len(dt_info) + + dt_match = [False] * len(dt_info) + gt_match = [False] * len(gt_info) + + all_ious = defaultdict(tuple) + # gt: {text, label, bbox or poly} + for index_gt, gt in enumerate(gt_info): + if "poly" not in gt: + gt["poly"] = convert_bbox_to_polygon(gt["bbox"]) + gt_poly = polygon_from_str(gt["poly"]) + for index_dt, dt in enumerate(dt_info): + if "poly" not in dt: + dt["poly"] = convert_bbox_to_polygon(dt["bbox"]) + dt_poly = polygon_from_str(dt["poly"]) + iou = polygon_iou(dt_poly, gt_poly) + if iou >= iou_thresh: + all_ious[(index_gt, index_dt)] = iou + sorted_ious = sorted( + 
all_ious.items(), key=operator.itemgetter(1), reverse=True) + sorted_gt_dt_pairs = [item[0] for item in sorted_ious] + + # matched gt and dt + for gt_dt_pair in sorted_gt_dt_pairs: + index_gt, index_dt = gt_dt_pair + if gt_match[index_gt] == False and dt_match[index_dt] == False: + gt_match[index_gt] = True + dt_match[index_dt] = True + # ocr rec results + gt_text = gt_info[index_gt]["text"] + dt_text = dt_info[index_dt]["text"] + + # ser results + gt_label = gt_info[index_gt]["label"] + dt_label = dt_info[index_dt]["pred"] + + if True: # ignore_masks[index_gt] == '0': + ed_sum += ed(args, gt_text, dt_text) + num_gt_chars += len(gt_text) + if gt_text == dt_text: + if args.ignore_ser_prediction or gt_label == dt_label: + hit += 1 + +# unmatched dt + for tindex, dt_match_flag in enumerate(dt_match): + if dt_match_flag == False: + dt_text = dt_info[tindex]["text"] + gt_text = "" + ed_sum += ed(args, dt_text, gt_text) + +# unmatched gt + for tindex, gt_match_flag in enumerate(gt_match): + if gt_match_flag == False: + dt_text = "" + gt_text = gt_info[tindex]["text"] + ed_sum += ed(args, gt_text, dt_text) + num_gt_chars += len(gt_text) + + eps = 1e-9 + print("config: ", args) + print('hit, dt_count, gt_count', hit, dt_count, gt_count) + precision = hit / (dt_count + eps) + recall = hit / (gt_count + eps) + fmeasure = 2.0 * precision * recall / (precision + recall + eps) + avg_edit_dist_img = ed_sum / len(gt_results) + avg_edit_dist_field = ed_sum / (gt_count + eps) + character_acc = 1 - ed_sum / (num_gt_chars + eps) + + print('character_acc: %.2f' % (character_acc * 100) + "%") + print('avg_edit_dist_field: %.2f' % (avg_edit_dist_field)) + print('avg_edit_dist_img: %.2f' % (avg_edit_dist_img)) + print('precision: %.2f' % (precision * 100) + "%") + print('recall: %.2f' % (recall * 100) + "%") + print('fmeasure: %.2f' % (fmeasure * 100) + "%") + + return + + +def parse_args(): + """ + """ + + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument( + "--gt_json_path", + default=None, + type=str, + required=True, ) + parser.add_argument( + "--pred_json_path", + default=None, + type=str, + required=True, ) + + parser.add_argument("--iou_thres", default=0.5, type=float) + + parser.add_argument( + "--ignore_case", + default=False, + type=str2bool, + help="whether to do lower case for the strs") + + parser.add_argument( + "--ignore_space", + default=True, + type=str2bool, + help="whether to ignore space") + + parser.add_argument( + "--ignore_background", + default=True, + type=str2bool, + help="whether to ignore other label") + + parser.add_argument( + "--ignore_ser_prediction", + default=False, + type=str2bool, + help="whether to ignore ocr pred results") + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + eval_e2e(args) diff --git a/ppstructure/kie/tools/trans_funsd_label.py b/ppstructure/kie/tools/trans_funsd_label.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7d1db010a925b37d285befe77aa202db2141d9 --- /dev/null +++ b/ppstructure/kie/tools/trans_funsd_label.py @@ -0,0 +1,151 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys +import cv2 +import numpy as np +from copy import deepcopy + + +def trans_poly_to_bbox(poly): + x1 = np.min([p[0] for p in poly]) + x2 = np.max([p[0] for p in poly]) + y1 = np.min([p[1] for p in poly]) + y2 = np.max([p[1] for p in poly]) + return [x1, y1, x2, y2] + + +def get_outer_poly(bbox_list): + x1 = min([bbox[0] for bbox in bbox_list]) + y1 = min([bbox[1] for bbox in bbox_list]) + x2 = max([bbox[2] for bbox in bbox_list]) + y2 = max([bbox[3] for bbox in bbox_list]) + return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + + +def load_funsd_label(image_dir, anno_dir): + imgs = os.listdir(image_dir) + annos = os.listdir(anno_dir) + + imgs = [img.replace(".png", "") for img in imgs] + annos = [anno.replace(".json", "") for anno in annos] + + fn_info_map = dict() + for anno_fn in annos: + res = [] + with open(os.path.join(anno_dir, anno_fn + ".json"), "r") as fin: + infos = json.load(fin) + infos = infos["form"] + old_id2new_id_map = dict() + global_new_id = 0 + for info in infos: + if info["text"] is None: + continue + words = info["words"] + if len(words) <= 0: + continue + word_idx = 1 + curr_bboxes = [words[0]["box"]] + curr_texts = [words[0]["text"]] + while word_idx < len(words): + # switch to a new link + if words[word_idx]["box"][0] + 10 <= words[word_idx - 1][ + "box"][2]: + if len("".join(curr_texts[0])) > 0: + res.append({ + "transcription": " ".join(curr_texts), + "label": info["label"], + "points": get_outer_poly(curr_bboxes), + "linking": info["linking"], + "id": global_new_id, + }) + if info["id"] not in old_id2new_id_map: + old_id2new_id_map[info["id"]] = [] + old_id2new_id_map[info["id"]].append(global_new_id) + global_new_id += 1 + curr_bboxes = [words[word_idx]["box"]] + curr_texts = [words[word_idx]["text"]] + else: + curr_bboxes.append(words[word_idx]["box"]) + curr_texts.append(words[word_idx]["text"]) + word_idx += 1 + if len("".join(curr_texts[0])) > 0: + res.append({ + "transcription": " ".join(curr_texts), + "label": info["label"], + "points": get_outer_poly(curr_bboxes), + "linking": info["linking"], + "id": global_new_id, + }) + if info["id"] not in old_id2new_id_map: + old_id2new_id_map[info["id"]] = [] + old_id2new_id_map[info["id"]].append(global_new_id) + global_new_id += 1 + res = sorted( + res, key=lambda r: (r["points"][0][1], r["points"][0][0])) + for i in range(len(res) - 1): + for j in range(i, 0, -1): + if abs(res[j + 1]["points"][0][1] - res[j]["points"][0][1]) < 20 and \ + (res[j + 1]["points"][0][0] < res[j]["points"][0][0]): + tmp = deepcopy(res[j]) + res[j] = deepcopy(res[j + 1]) + res[j + 1] = deepcopy(tmp) + else: + break + # re-generate unique ids + for idx, r in enumerate(res): + new_links = [] + for link in r["linking"]: + # illegal links will be removed + if link[0] not in old_id2new_id_map or link[ + 1] not in old_id2new_id_map: + continue + for src in old_id2new_id_map[link[0]]: + for dst in old_id2new_id_map[link[1]]: + new_links.append([src, dst]) + res[idx]["linking"] = deepcopy(new_links) + + fn_info_map[anno_fn] = res + + return fn_info_map + + +def main(): + test_image_dir = 
"train_data/FUNSD/testing_data/images/" + test_anno_dir = "train_data/FUNSD/testing_data/annotations/" + test_output_dir = "train_data/FUNSD/test.json" + + fn_info_map = load_funsd_label(test_image_dir, test_anno_dir) + with open(test_output_dir, "w") as fout: + for fn in fn_info_map: + fout.write(fn + ".png" + "\t" + json.dumps( + fn_info_map[fn], ensure_ascii=False) + "\n") + + train_image_dir = "train_data/FUNSD/training_data/images/" + train_anno_dir = "train_data/FUNSD/training_data/annotations/" + train_output_dir = "train_data/FUNSD/train.json" + + fn_info_map = load_funsd_label(train_image_dir, train_anno_dir) + with open(train_output_dir, "w") as fout: + for fn in fn_info_map: + fout.write(fn + ".png" + "\t" + json.dumps( + fn_info_map[fn], ensure_ascii=False) + "\n") + print("====ok====") + return + + +if __name__ == "__main__": + main() diff --git a/ppstructure/kie/tools/trans_xfun_data.py b/ppstructure/kie/tools/trans_xfun_data.py new file mode 100644 index 0000000000000000000000000000000000000000..11d221bea40367f091b3e09dde42e87f2217a617 --- /dev/null +++ b/ppstructure/kie/tools/trans_xfun_data.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + + +def transfer_xfun_data(json_path=None, output_file=None): + with open(json_path, "r", encoding='utf-8') as fin: + lines = fin.readlines() + + json_info = json.loads(lines[0]) + documents = json_info["documents"] + with open(output_file, "w", encoding='utf-8') as fout: + for idx, document in enumerate(documents): + label_info = [] + img_info = document["img"] + document = document["document"] + image_path = img_info["fname"] + + for doc in document: + x1, y1, x2, y2 = doc["box"] + points = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + label_info.append({ + "transcription": doc["text"], + "label": doc["label"], + "points": points, + "id": doc["id"], + "linking": doc["linking"] + }) + + fout.write(image_path + "\t" + json.dumps( + label_info, ensure_ascii=False) + "\n") + + print("===ok====") + + +def parser_args(): + import argparse + parser = argparse.ArgumentParser(description="args for paddleserving") + parser.add_argument( + "--ori_gt_path", type=str, required=True, help='origin xfun gt path') + parser.add_argument( + "--output_path", type=str, required=True, help='path to save') + args = parser.parse_args() + return args + + +args = parser_args() +transfer_xfun_data(args.ori_gt_path, args.output_path) diff --git a/ppstructure/layout/README.md b/ppstructure/layout/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6830f8e82153f8ae7d2e798cda6782bc5518da4c --- /dev/null +++ b/ppstructure/layout/README.md @@ -0,0 +1,470 @@ +English | [简体中文](README_ch.md) + +# Layout analysis + +- [1. Introduction](#1-Introduction) +- [2. Quick start](#2-Quick-start) +- [3. Install](#3-Install) + - [3.1 Install PaddlePaddle](#31-Install-paddlepaddle) + - [3.2 Install PaddleDetection](#32-Install-paddledetection) +- [4. 
Data preparation](#4-Data-preparation) + - [4.1 English data set](#41-English-data-set) + - [4.2 More datasets](#42-More-datasets) +- [5. Start training](#5-Start-training) + - [5.1 Train](#51-Train) + - [5.2 FGD Distillation training](#52-Fgd-distillation-training) +- [6. Model evaluation and prediction](#6-Model-evaluation-and-prediction) + - [6.1 Indicator evaluation](#61-Indicator-evaluation) + - [6.2 Test layout analysis results](#62-Test-layout-analysis-results) +- [7. Model export and inference](#7-Model-export-and-inference) + - [7.1 Model export](#71-Model-export) + - [7.2 Model inference](#72-Model-inference) + + +## 1. Introduction + +Layout analysis refers to the regional division of documents in the form of pictures and the positioning of key areas, such as text, title, table, picture, etc. The layout analysis algorithm is based on the lightweight model PP-picodet of [PaddleDetection]( https://github.com/PaddlePaddle/PaddleDetection ), including English layout analysis, Chinese layout analysis and table layout analysis models. English layout analysis models can detect document layout elements such as text, title, table, figure, list. Chinese layout analysis models can detect document layout elements such as text, figure, figure caption, table, table caption, header, footer, reference, and equation. Table layout analysis models can detect table regions. + +
+ +
+ +## 2. Quick start +PP-Structure currently provides layout analysis models in Chinese, English and table documents. For the model link, see [models_list](../docs/models_list_en.md). The whl package is also provided for quick use, see [quickstart](../docs/quickstart_en.md) for details. + +## 3. Install + +### 3.1. Install PaddlePaddle + +- **(1) Install PaddlePaddle** + +```bash +python3 -m pip install --upgrade pip + +# GPU Install +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU Install +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple +``` +For more requirements, please refer to the instructions in the [Install file](https://www.paddlepaddle.org.cn/install/quick)。 + +### 3.2. Install PaddleDetection + +- **(1)Download PaddleDetection Source code** + +```bash +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` + +- **(2)Install third-party libraries** + +```bash +cd PaddleDetection +python3 -m pip install -r requirements.txt +``` + +## 4. Data preparation + +If you want to experience the prediction process directly, you can skip data preparation and download the pre-training model. + +### 4.1. English data set + +Download document analysis data set [PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(Dataset 96G),contains 5 classes:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` + +``` +# Download data +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# Decompress data +tar -xvf publaynet.tar.gz +``` + +Uncompressed **directory structure:** + +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... +``` + +**data distribution:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | Training set pictures | 335,703 | +| `val/` | Verification set pictures | 11,245 | +| `test/` | Test set pictures | 11,405 | +| `train.json` | Training set annotation files | - | +| `val.json` | Validation set dimension files | - | + +**Data Annotation** + +The JSON file contains the annotations of all images, and the data is stored in a dictionary nested manner.Contains the following keys: + +- info,represents the dimension file info。 + +- licenses,represents the dimension file licenses。 + +- images,represents the list of image information in the annotation file,each element is the information of an image。The information of one of the images is as follows: + + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` + +- annotations, represents the list of annotation information of the target object in the annotation file,each element is the annotation information of a target object。The following is the annotation information of one of the target objects: + + ``` + { + + 'segmentation': # Segmentation annotation of objects + 'area': 60518.099043117836, # Area of object + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` + +### 4.2. More datasets + +We provide CDLA(Chinese layout analysis), TableBank(Table layout analysis)etc. 
dataset download links below. After converting them into the JSON annotation format described above, training can be conducted in the same way.
+
+| Dataset | Description |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | For table detection (TRACK A) and table recognition (TRACK B). Image types include a historical dataset (file names beginning with cTDaR_t0, such as cTDaR_t00872.jpg) and a modern dataset (beginning with cTDaR_t1, such as cTDaR_t10482.jpg). |
+| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | A dataset constructed by manually annotating figures and pages from publicly available annual reports, containing 5 categories: table, figure, natural image, logo, and signature. |
+| [TableBank](https://github.com/doc-analysis/TableBank) | A large dataset for table detection and recognition, covering both Word and LaTeX document formats. |
+| [CDLA](https://github.com/buptlihang/CDLA) | Chinese document layout analysis dataset for Chinese literature (paper) scenarios, containing 10 categories: Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation. |
+| [DocBank](https://github.com/doc-analysis/DocBank) | Large-scale dataset (500K document pages) constructed with weakly supervised methods for document layout analysis, containing 12 categories: Author, Caption, Date, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title. |
+
+
+## 5. Start training
+
+Training scripts, evaluation scripts, and prediction scripts are provided. This section uses the PubLayNet pre-trained model as an example.
+
+If you do not want to train the model and only want to experience the following workflow of model evaluation, prediction, dynamic-to-static export, and inference, you can download the provided pre-trained model (PubLayNet dataset) and skip this part.
+
+```
+mkdir pretrained_model
+cd pretrained_model
+# Download the PubLayNet pre-trained model (to directly experience model evaluation, prediction, and dynamic-to-static export)
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams
+# Download the PubLayNet inference model (to directly experience model inference)
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar
+```
+
+If the test images are Chinese, you can download the pre-trained model of the Chinese CDLA dataset, which detects 10 types of document regions: Text, Title, Figure, Figure caption, Table, Table caption, Header, Footer, Reference, Equation. Download the training model and inference model of `picodet_lcnet_x1_0_fgd_layout_cdla` from the [layout analysis models](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md). If only the table regions in the image need to be detected, you can download the pre-trained model of the table dataset, i.e., the training model and inference model of `picodet_lcnet_x1_0_fgd_layout_table`, from the same [layout analysis models](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md) page.
+
+### 5.1. Train
+
+Start training with the PaddleDetection [layout analysis configuration files](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/picodet/legacy_model/application/layout_analysis).
+
+* Modify the configuration file
+
+If you want to train your own dataset, you need to modify the data configuration and the number of categories in the configuration file.
+ + +Using 'configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml' as an example, the change is as follows: + +```yaml +metric: COCO +# Number of categories +num_classes: 5 + +TrainDataset: + !COCODataSet + # Modify to your own training data directory + image_dir: train + # Modify to your own training data label file + anno_path: train.json + # Modify to your own training data root directory + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # Modify to your own validation data directory + image_dir: val + # Modify to your own validation data label file + anno_path: val.json + # Modify to your own validation data root + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # Modify to your own test data label file + anno_path: /root/publaynet/val.json +``` + +* Start training. During training, PP picodet pre training model will be downloaded by default. There is no need to download in advance. + +```bash +# GPU training supports single-card and multi-card training +# The training log is automatically saved to the log directory + +# Single card training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# Multi-card training, with the -- GPUS parameter specifying the card number +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**Attention:**If the video memory is out during training, adjust Batch_size in TrainReader and base_LR in LearningRate. The published config is obtained by 8-card training. If the number of GPU cards is changed to 1, then the base_LR needs to be reduced by 8 times. + +After starting training normally, you will see the following log output: + +``` +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval` indicates that the best model is saved as `output/picodet_lcnet_x1_0_layout/best_accuracy` by default during the evaluation process 。 + +**Note that the configuration file for prediction / evaluation must be consistent with the training.** + +### 5.2. 
FGD Distillation Training + +PaddleDetection supports FGD-based [Focal and Global Knowledge Distillation for Detectors]( https://arxiv.org/abs/2111.11837v1) The training process of the target detection model of distillation, FGD distillation is divided into two parts `Focal` and `Global`. `Focal` Distillation separates the foreground and background of the image, allowing the student model to focus on the key pixels of the foreground and background features of the teacher model respectively;` Global`Distillation section reconstructs the relationships between different pixels and transfers them from the teacher to the student to compensate for the global information lost in `Focal`Distillation. + +Change the dataset and modify the data configuration and number of categories in the [TODO] configuration, referring to 4.1. Start training: + +```bash +# Single Card Training +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the compression policy profile. + +## 6. Model evaluation and prediction + +### 6.1. Indicator evaluation + + Model parameters in training are saved by default in `output/picodet_ Lcnet_ X1_ 0_ Under the layout` directory. When evaluating indicators, you need to set `weights` to point to the saved parameter file.Assessment datasets can be accessed via `configs/picodet/legacy_ Model/application/layout_ Analysis/picodet_ Lcnet_ X1_ 0_ Layout. Yml` . Modify `EvalDataset` : `img_dir`,`anno_ Path`and`dataset_dir` setting. + +```bash +# GPU evaluation, weights as weights to be measured +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` + +The following information will be printed out, such as mAP, AP0.5, etc. + +```py + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
+``` + +If you use the provided pre-training model for evaluation or the FGD distillation training model, replace the `weights` model path and execute the following command for evaluation: + +``` +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: Specify the model configuration file. +- `--slim_config`: Specify the distillation policy profile. +- `-o weights`: Specify the model path trained by the distillation algorithm. + +### 6.2. Test Layout Analysis Results + + +The profile predicted to be used must be consistent with the training, for example, if you pass `python3 tools/train'. Py-c configs/picodet/legacy_ Model/application/layout_ Analysis/picodet_ Lcnet_ X1_ 0_ Layout. Yml` completed the training process for the model. + +With trained PaddleDetection model, you can use the following commands to make model predictions. + +```bash +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + +- `--infer_img`: Reasoning for a single picture can also be done via `--infer_ Dir`Inform all pictures in the file. +- `--output_dir`: Specify the path to save the visualization results. +- `--draw_threshold`:Specify the NMS threshold for drawing the result box. + +If you use the provided pre-training model for prediction or the FGD distillation training model, change the `weights` model path and execute the following command to make the prediction: + +``` +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + + +## 7. Model Export and Inference + + +### 7.1 Model Export + +The inference model (the model saved by `paddle.jit.save`) is generally a solidified model saved after the model training is completed, and is mostly used to give prediction in deployment. + +The model saved during the training process is the checkpoints model, which saves the parameters of the model and is mostly used to resume training. + +Compared with the checkpoints model, the inference model will additionally save the structural information of the model. Therefore, it is easier to deploy because the model structure and model parameters are already solidified in the inference model file, and is suitable for integration with actual systems. 
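+
+To make the difference concrete, the following minimal sketch (an illustration, not a file in this repository) opens an exported layout model directory with the Paddle Inference Python API; it assumes the `output_inference/picodet_lcnet_x1_0_layout/` directory produced by the export command below. Real deployment, including the required pre- and post-processing, is handled by `deploy/python/infer.py` as shown in section 7.2.
+
+```python
+# Illustrative sketch: the exported files already contain the solidified network
+# structure, so they can be opened without the training-time configuration.
+from paddle.inference import Config, create_predictor
+
+model_dir = "output_inference/picodet_lcnet_x1_0_layout"
+config = Config(model_dir + "/model.pdmodel", model_dir + "/model.pdiparams")
+config.disable_gpu()  # use config.enable_use_gpu(200, 0) to run on GPU 0
+predictor = create_predictor(config)
+
+# Inspect the input tensors expected by the solidified graph.
+print(predictor.get_input_names())
+```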
+ +Layout analysis model to inference model steps are as follows: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +* If no post-export processing is required, specify:`-o export.benchmark=True`(If -o already exists, delete -o here) +* If you do not need to export NMS, specify:`-o export.nms=False` + +After successful conversion, there are three files in the directory: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference Parameter file for model + ├── model.pdiparams.info # inference Model parameter information, ignorable + └── model.pdmodel # inference Model Structure File for Model +``` + +If you change the `weights` model path using the provided pre-training model to the Inference model, or using the FGD distillation training model, the model to inference model steps are as follows: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + +### 7.2 Model inference + +Replace model_with the provided inference training model for inference or the FGD distillation training `model_dir`Inference model path, execute the following commands for inference: + +```bash +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +- --device:Specify the GPU or CPU device + +When model inference is complete, you will see the following log output: + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:model structure +- Transform Order:Preprocessing operation +- class_id, confidence, left_top, right_bottom:Indicates category id, confidence level, upper left coordinate, lower right coordinate, respectively +- save result to:Save path of visual layout analysis results, default save to ./output folder +- inference time info:Inference 
time, where preprocess_time represents the preprocessing time, Inference_time represents the model prediction time, and postprocess_time represents the post-processing time + +The result of visualization layout is shown in the following figure + +
+ +
+ + + +## Citations + +``` +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, + pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/ppstructure/layout/README_ch.md b/ppstructure/layout/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..adef46d47389a50bf34500eee1aaf52ff5dfe449 --- /dev/null +++ b/ppstructure/layout/README_ch.md @@ -0,0 +1,469 @@ +简体中文 | [English](README.md) + +# 版面分析 + +- [1. 简介](#1-简介) +- [2. 快速开始](#2-快速开始) +- [3. 安装](#3-安装) + - [3.1 安装PaddlePaddle](#31-安装paddlepaddle) + - [3.2 安装PaddleDetection](#32-安装paddledetection) +- [4. 数据准备](#4-数据准备) + - [4.1 英文数据集](#41-英文数据集) + - [4.2 更多数据集](#42-更多数据集) +- [5. 开始训练](#5-开始训练) + - [5.1 启动训练](#51-启动训练) + - [5.2 FGD蒸馏训练](#52-fgd蒸馏训练) +- [6. 模型评估与预测](#6-模型评估与预测) + - [6.1 指标评估](#61-指标评估) + - [6.2 测试版面分析结果](#62-测试版面分析结果) +- [7 模型导出与预测](#7-模型导出与预测) + - [7.1 模型导出](#71-模型导出) + - [7.2 模型推理](#72-模型推理) + +## 1. 简介 + +版面分析指的是对图片形式的文档进行区域划分,定位其中的关键区域,如文字、标题、表格、图片等。版面分析算法基于[PaddleDetection](https://github.com/PaddlePaddle/PaddleDetection)的轻量模型PP-PicoDet进行开发,包含英文、中文、表格版面分析3类模型。其中,英文模型支持Text、Title、Tale、Figure、List5类区域的检测,中文模型支持Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation10类区域的检测,表格版面分析支持Table区域的检测,版面分析效果如下图所示: + +
+ +
+ +## 2. 快速开始 + +PP-Structure目前提供了中文、英文、表格三类文档版面分析模型,模型链接见 [models_list](../docs/models_list.md#1-版面分析模型)。也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 + + +## 3. 安装 + +### 3.1. 安装PaddlePaddle + +- **(1) 安装PaddlePaddle** + +```bash +python3 -m pip install --upgrade pip + +# GPU安装 +python3 -m pip install "paddlepaddle-gpu>=2.3" -i https://mirror.baidu.com/pypi/simple + +# CPU安装 +python3 -m pip install "paddlepaddle>=2.3" -i https://mirror.baidu.com/pypi/simple +``` +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + +### 3.2. 安装PaddleDetection + +- **(1)下载PaddleDetection源码** + +```bash +git clone https://github.com/PaddlePaddle/PaddleDetection.git +``` + +- **(2)安装其他依赖** + +```bash +cd PaddleDetection +python3 -m pip install -r requirements.txt +``` + +## 4. 数据准备 + +如果希望直接体验预测过程,可以跳过数据准备,下载我们提供的预训练模型。 + +### 4.1. 英文数据集 + +下载文档分析数据集[PubLayNet](https://developer.ibm.com/exchanges/data/all/publaynet/)(数据集96G),包含5个类:`{0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}` + +``` +# 下载数据 +wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/publaynet.tar.gz +# 解压数据 +tar -xvf publaynet.tar.gz +``` + +解压之后的**目录结构:** + +``` +|-publaynet + |- test + |- PMC1277013_00004.jpg + |- PMC1291385_00002.jpg + | ... + |- train.json + |- train + |- PMC1291385_00002.jpg + |- PMC1277013_00004.jpg + | ... + |- val.json + |- val + |- PMC538274_00004.jpg + |- PMC539300_00004.jpg + | ... +``` + +**数据分布:** + +| File or Folder | Description | num | +| :------------- | :------------- | ------- | +| `train/` | 训练集图片 | 335,703 | +| `val/` | 验证集图片 | 11,245 | +| `test/` | 测试集图片 | 11,405 | +| `train.json` | 训练集标注文件 | - | +| `val.json` | 验证集标注文件 | - | + +**标注格式:** + +json文件包含所有图像的标注,数据以字典嵌套的方式存放,包含以下key: + +- info,表示标注文件info。 + +- licenses,表示标注文件licenses。 + +- images,表示标注文件中图像信息列表,每个元素是一张图像的信息。如下为其中一张图像的信息: + + ``` + { + 'file_name': 'PMC4055390_00006.jpg', # file_name + 'height': 601, # image height + 'width': 792, # image width + 'id': 341427 # image id + } + ``` + +- annotations,表示标注文件中目标物体的标注信息列表,每个元素是一个目标物体的标注信息。如下为其中一个目标物体的标注信息: + + ``` + { + + 'segmentation': # 物体的分割标注 + 'area': 60518.099043117836, # 物体的区域面积 + 'iscrowd': 0, # iscrowd + 'image_id': 341427, # image id + 'bbox': [50.58, 490.86, 240.15, 252.16], # bbox [x1,y1,w,h] + 'category_id': 1, # category_id + 'id': 3322348 # image id + } + ``` + +### 4.2. 更多数据集 + +我们提供了CDLA(中文版面分析)、TableBank(表格版面分析)等数据集的下连接,处理为上述标注文件json格式,即可以按相同方式进行训练。 + +| dataset | 简介 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cTDaR2019_cTDaR](https://cndplab-founder.github.io/cTDaR2019/) | 用于表格检测(TRACKA)和表格识别(TRACKB)。图片类型包含历史数据集(以cTDaR_t0开头,如cTDaR_t00872.jpg)和现代数据集(以cTDaR_t1开头,cTDaR_t10482.jpg)。 | +| [IIIT-AR-13K](http://cvit.iiit.ac.in/usodi/iiitar13k.php) | 手动注释公开的年度报告中的图形或页面而构建的数据集,包含5类:table, figure, natural image, logo, and signature | +| [CDLA](https://github.com/buptlihang/CDLA) | 中文文档版面分析数据集,面向中文文献类(论文)场景,包含10类:Text、Title、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation | +| [TableBank](https://github.com/doc-analysis/TableBank) | 用于表格检测和识别大型数据集,包含Word和Latex2种文档格式 | +| [DocBank](https://github.com/doc-analysis/DocBank) | 使用弱监督方法构建的大规模数据集(500K文档页面),用于文档布局分析,包含12类:Author、Caption、Date、Equation、Figure、Footer、List、Paragraph、Reference、Section、Table、Title | + + +## 5. 
开始训练 + +提供了训练脚本、评估脚本和预测脚本,本节将以PubLayNet预训练模型为例进行讲解。 + +如果不希望训练,直接体验后面的模型评估、预测、动转静、推理的流程,可以下载提供的预训练模型(PubLayNet数据集),并跳过5.1和5.2。 + +``` +mkdir pretrained_model +cd pretrained_model +# 下载PubLayNet预训练模型(直接体验模型评估、预测、动转静) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams +# 下载PubLaynet推理模型(直接体验模型推理) +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +``` + +如果测试图片为中文,可以下载中文CDLA数据集的预训练模型,识别10类文档区域:Table、Figure、Figure caption、Table、Table caption、Header、Footer、Reference、Equation,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_cdla`模型的训练模型和推理模型。如果只检测图片中的表格区域,可以下载表格数据集的预训练模型,在[版面分析模型](../docs/models_list.md)中下载`picodet_lcnet_x1_0_fgd_layout_table`模型的训练模型和推理模型。 + +### 5.1. 启动训练 + +使用PaddleDetection[版面分析配置文件](https://github.com/PaddlePaddle/PaddleDetection/tree/release/2.5/configs/picodet/legacy_model/application/layout_analysis)启动训练 + +* 修改配置文件 + +如果你希望训练自己的数据集,需要修改配置文件中的数据配置、类别数。 + + +以`configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 为例,修改的内容如下所示。 + +```yaml +metric: COCO +# 类别数 +num_classes: 5 + +TrainDataset: + !COCODataSet + # 修改为你自己的训练数据目录 + image_dir: train + # 修改为你自己的训练数据标签文件 + anno_path: train.json + # 修改为你自己的训练数据根目录 + dataset_dir: /root/publaynet/ + data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd'] + +EvalDataset: + !COCODataSet + # 修改为你自己的验证数据目录 + image_dir: val + # 修改为你自己的验证数据标签文件 + anno_path: val.json + # 修改为你自己的验证数据根目录 + dataset_dir: /root/publaynet/ + +TestDataset: + !ImageFolder + # 修改为你自己的测试数据标签文件 + anno_path: /root/publaynet/val.json +``` + +* 开始训练,在训练时,会默认下载PP-PicoDet预训练模型,这里无需预先下载。 + +```bash +# GPU训练 支持单卡,多卡训练 +# 训练日志会自动保存到 log 目录中 + +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval + +# 多卡训练,通过--gpus参数指定卡号 +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --gpus '0,1,2,3' tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --eval +``` + +**注意:**如果训练时显存out memory,将TrainReader中batch_size调小,同时LearningRate中base_lr等比例减小。发布的config均由8卡训练得到,如果改变GPU卡数为1,那么base_lr需要减小8倍。 + +正常启动训练后,会看到以下log输出: + +``` +[08/15 04:02:30] ppdet.utils.checkpoint INFO: Finish loading model weights: /root/.cache/paddle/weights/LCNet_x1_0_pretrained.pdparams +[08/15 04:02:46] ppdet.engine INFO: Epoch: [0] [ 0/1929] learning_rate: 0.040000 loss_vfl: 1.216707 loss_bbox: 1.142163 loss_dfl: 0.544196 loss: 2.903065 eta: 17 days, 13:50:26 batch_cost: 15.7452 data_cost: 2.9112 ips: 1.5243 images/s +[08/15 04:03:19] ppdet.engine INFO: Epoch: [0] [ 20/1929] learning_rate: 0.064000 loss_vfl: 1.180627 loss_bbox: 0.939552 loss_dfl: 0.442436 loss: 2.628206 eta: 2 days, 12:18:53 batch_cost: 1.5770 data_cost: 0.0008 ips: 15.2184 images/s +[08/15 04:03:47] ppdet.engine INFO: Epoch: [0] [ 40/1929] learning_rate: 0.088000 loss_vfl: 0.543321 loss_bbox: 1.071401 loss_dfl: 0.457817 loss: 2.057003 eta: 2 days, 0:07:03 batch_cost: 1.3190 data_cost: 0.0007 ips: 18.1954 images/s +[08/15 04:04:12] ppdet.engine INFO: Epoch: [0] [ 60/1929] learning_rate: 0.112000 loss_vfl: 0.630989 loss_bbox: 0.859183 loss_dfl: 0.384702 loss: 1.883143 eta: 1 day, 19:01:29 batch_cost: 1.2177 data_cost: 0.0006 ips: 19.7087 images/s +``` + +- `--eval`表示训练的同时,进行评估, 评估过程中默认将最佳模型,保存为 `output/picodet_lcnet_x1_0_layout/best_accuracy` 。 + +**注意,预测/评估时的配置文件请务必与训练一致。** + +### 5.2. 
FGD蒸馏训练 + +PaddleDetection支持了基于FGD([Focal and Global Knowledge Distillation for Detectors](https://arxiv.org/abs/2111.11837v1))蒸馏的目标检测模型训练过程,FGD蒸馏分为两个部分`Focal`和`Global`。`Focal`蒸馏分离图像的前景和背景,让学生模型分别关注教师模型的前景和背景部分特征的关键像素;`Global`蒸馏部分重建不同像素之间的关系并将其从教师转移到学生,以补偿`Focal`蒸馏中丢失的全局信息。 + +更换数据集,修改【TODO】配置中的数据配置、类别数,具体可以参考4.1。启动训练: + +```bash +# 单卡训练 +export CUDA_VISIBLE_DEVICES=0 +python3 tools/train.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + --eval +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定压缩策略配置文件。 + +## 6. 模型评估与预测 + +### 6.1. 指标评估 + +训练中模型参数默认保存在`output/picodet_lcnet_x1_0_layout`目录下。在评估指标时,需要设置`weights`指向保存的参数文件。评估数据集可以通过 `configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 修改`EvalDataset`中的 `image_dir`、`anno_path`和`dataset_dir` 设置。 + +```bash +# GPU 评估, weights 为待测权重 +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=./output/picodet_lcnet_x1_0_layout/best_model +``` + +会输出以下信息,打印出mAP、AP0.5等信息。 + +```py + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.935 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.956 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.404 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.782 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.539 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.938 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.949 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.495 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.818 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.978 +[08/15 07:07:09] ppdet.engine INFO: Total sample number: 11245, averge FPS: 24.405059207157436 +[08/15 07:07:09] ppdet.engine INFO: Best test bbox ap is 0.935. 
+``` + +若使用**提供的预训练模型进行评估**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行评估: + +``` +python3 tools/eval.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=output/picodet_lcnet_x2_5_layout/best_model +``` + +- `-c`: 指定模型配置文件。 +- `--slim_config`: 指定蒸馏策略配置文件。 +- `-o weights`: 指定蒸馏算法训好的模型路径。 + +### 6.2 测试版面分析结果 + + +预测使用的配置文件必须与训练一致,如您通过 `python3 tools/train.py -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml` 完成了模型的训练过程。 + +使用 PaddleDetection 训练好的模型,您可以使用如下命令进行模型预测。 + +```bash +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights='output/picodet_lcnet_x1_0_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + +- `--infer_img`: 推理单张图片,也可以通过`--infer_dir`推理文件中的所有图片。 +- `--output_dir`: 指定可视化结果保存路径。 +- `--draw_threshold`:指定绘制结果框的NMS阈值。 + +若使用**提供的预训练模型进行预测**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,执行如下命令进行预测: + +``` +python3 tools/infer.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights='output/picodet_lcnet_x2_5_layout/best_model.pdparams' \ + --infer_img='docs/images/layout.jpg' \ + --output_dir=output_dir/ \ + --draw_threshold=0.5 +``` + + +## 7. 模型导出与预测 + + +### 7.1 模型导出 + +inference 模型(`paddle.jit.save`保存的模型) 一般是模型训练,把模型结构和模型参数保存在文件中的固化模型,多用于预测部署场景。 训练过程中保存的模型是checkpoints模型,保存的只有模型的参数,多用于恢复训练等。 与checkpoints模型相比,inference 模型会额外保存模型的结构信息,在预测部署、加速推理上性能优越,灵活方便,适合于实际系统集成。 + +版面分析模型转inference模型步骤如下: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + -o weights=output/picodet_lcnet_x1_0_layout/best_model \ + --output_dir=output_inference/ +``` + +* 如无需导出后处理,请指定:`-o export.benchmark=True`(如果-o已出现过,此处删掉-o) +* 如无需导出NMS,请指定:`-o export.nms=False` + +转换成功后,在目录下有三个文件: + +``` +output_inference/picodet_lcnet_x1_0_layout/ + ├── model.pdiparams # inference模型的参数文件 + ├── model.pdiparams.info # inference模型的参数信息,可忽略 + └── model.pdmodel # inference模型的模型结构文件 +``` + +若使用**提供的预训练模型转Inference模型**,或使用**FGD蒸馏训练的模型**,更换`weights`模型路径,模型转inference模型步骤如下: + +```bash +python3 tools/export_model.py \ + -c configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml \ + --slim_config configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml \ + -o weights=./output/picodet_lcnet_x2_5_layout/best_model \ + --output_dir=output_inference/ +``` + + + +### 7.2 模型推理 + +若使用**提供的推理训练模型推理**,或使用**FGD蒸馏训练的模型**,更换`model_dir`推理模型路径,执行如下命令进行推理: + +```bash +python3 deploy/python/infer.py \ + --model_dir=output_inference/picodet_lcnet_x1_0_layout/ \ + --image_file=docs/images/layout.jpg \ + --device=CPU +``` + +- --device:指定GPU、CPU设备 + +模型推理完成,会看到以下log输出 + +``` +------------------------------------------ +----------- Model Configuration ----------- +Model Arch: PicoDet +Transform Order: +--transform op: Resize +--transform op: NormalizeImage +--transform op: Permute +--transform op: PadStride +-------------------------------------------- +class_id:0, confidence:0.9921, left_top:[20.18,35.66],right_bottom:[341.58,600.99] +class_id:0, confidence:0.9914, 
left_top:[19.77,611.42],right_bottom:[341.48,901.82] +class_id:0, confidence:0.9904, left_top:[369.36,375.10],right_bottom:[691.29,600.59] +class_id:0, confidence:0.9835, left_top:[369.60,608.60],right_bottom:[691.38,736.72] +class_id:0, confidence:0.9830, left_top:[369.58,805.38],right_bottom:[690.97,901.80] +class_id:0, confidence:0.9716, left_top:[383.68,271.44],right_bottom:[688.93,335.39] +class_id:0, confidence:0.9452, left_top:[370.82,34.48],right_bottom:[688.10,63.54] +class_id:1, confidence:0.8712, left_top:[370.84,771.03],right_bottom:[519.30,789.13] +class_id:3, confidence:0.9856, left_top:[371.28,67.85],right_bottom:[685.73,267.72] +save result to: output/layout.jpg +Test iter 0 +------------------ Inference Time Info ---------------------- +total_time(ms): 2196.0, img_num: 1 +average latency time(ms): 2196.00, QPS: 0.455373 +preprocess_time(ms): 2172.50, inference_time(ms): 11.90, postprocess_time(ms): 11.60 +``` + +- Model:模型结构 +- Transform Order:预处理操作 +- class_id、confidence、left_top、right_bottom:分别表示类别id、置信度、左上角坐标、右下角坐标 +- save result to:可视化版面分析结果保存路径,默认保存到`./output`文件夹 +- Inference Time Info:推理时间,其中preprocess_time表示预处理耗时,inference_time表示模型预测耗时,postprocess_time表示后处理耗时 + +可视化版面结果如下图所示 + +
+ +
+ + + +## Citations + +``` +@inproceedings{zhong2019publaynet, + title={PubLayNet: largest dataset ever for document layout analysis}, + author={Zhong, Xu and Tang, Jianbin and Yepes, Antonio Jimeno}, + booktitle={2019 International Conference on Document Analysis and Recognition (ICDAR)}, + year={2019}, + volume={}, + number={}, + pages={1015-1022}, + doi={10.1109/ICDAR.2019.00166}, + ISSN={1520-5363}, + month={Sep.}, + organization={IEEE} +} + +@inproceedings{yang2022focal, + title={Focal and global knowledge distillation for detectors}, + author={Yang, Zhendong and Li, Zhe and Jiang, Xiaohu and Gong, Yuan and Yuan, Zehuan and Zhao, Danpei and Yuan, Chun}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={4643--4652}, + year={2022} +} +``` diff --git a/ppstructure/layout/__init__.py b/ppstructure/layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/layout/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppstructure/layout/__pycache__/__init__.cpython-37.pyc b/ppstructure/layout/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f85e9ca8f65e88cb8814dc1efb313653ec6a1a2e Binary files /dev/null and b/ppstructure/layout/__pycache__/__init__.cpython-37.pyc differ diff --git a/ppstructure/layout/__pycache__/__init__.cpython-38.pyc b/ppstructure/layout/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25a26e7d3e78ab59dd358537239f4e49bf37fd9e Binary files /dev/null and b/ppstructure/layout/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppstructure/layout/__pycache__/predict_layout.cpython-37.pyc b/ppstructure/layout/__pycache__/predict_layout.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8b900b508c8275cbe6efde117c853d316a69b31 Binary files /dev/null and b/ppstructure/layout/__pycache__/predict_layout.cpython-37.pyc differ diff --git a/ppstructure/layout/__pycache__/predict_layout.cpython-38.pyc b/ppstructure/layout/__pycache__/predict_layout.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e360eaa93042bb5f755bbce881140782a94b919 Binary files /dev/null and b/ppstructure/layout/__pycache__/predict_layout.cpython-38.pyc differ diff --git a/ppstructure/layout/predict_layout.py b/ppstructure/layout/predict_layout.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8c884e144654901737191141622abfaa872d24 --- /dev/null +++ b/ppstructure/layout/predict_layout.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time + +import tools.infer.utility as utility +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppstructure.utility import parse_args +from picodet_postprocess import PicoDetPostProcess + +logger = get_logger() + + +class LayoutPredictor(object): + def __init__(self, args): + pre_process_list = [{ + 'Resize': { + 'size': [800, 608] + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image'] + } + }] + postprocess_params = { + 'name': 'PicoDetPostProcess', + "layout_dict_path": args.layout_dict_path, + "score_threshold": args.layout_score_threshold, + "nms_threshold": args.layout_nms_threshold, + } + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'layout', logger) + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + img = data[0] + + if img is None: + return None, 0 + + img = np.expand_dims(img, axis=0) + img = img.copy() + + preds, elapse = 0, 1 + starttime = time.time() + + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + + np_score_list, np_boxes_list = [], [] + output_names = self.predictor.get_output_names() + num_outs = int(len(output_names) / 2) + for out_idx in range(num_outs): + np_score_list.append( + self.predictor.get_output_handle(output_names[out_idx]) + .copy_to_cpu()) + np_boxes_list.append( + self.predictor.get_output_handle(output_names[ + out_idx + num_outs]).copy_to_cpu()) + preds = dict(boxes=np_score_list, boxes_num=np_boxes_list) + + post_preds = self.postprocess_op(ori_im, img, preds) + elapse = time.time() - starttime + return post_preds, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + layout_predictor = LayoutPredictor(args) + count = 0 + total_time = 0 + + repeats = 50 + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + + layout_res, elapse = layout_predictor(img) + + logger.info("result: {}".format(layout_res)) + + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/pdf2word/README.md b/ppstructure/pdf2word/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..783555a3017d365bf4d0e969f2b0bdf81d03a779 --- /dev/null +++ b/ppstructure/pdf2word/README.md @@ -0,0 +1,49 @@ +# PDF2WORD + +PDF2Word是PaddleOCR社区开发者 [whjdark](https://github.com/whjdark) 基于PP-StructureV2版面分析与恢复模型实现的PDF转换Word应用程序,提供可直接安装的exe应用程序,**方便Windows用户免环境配置运行** + +## 1.使用 + +### 应用程序 + +1. 下载与安装:针对Windows用户,根据[软件下载]()一节下载软件后,运行 `pdf2word.exe` 。若您下载的是lite版本,安装过程中会在线下载环境依赖、模型等必要资源,安装时间较长,请确保网络畅通。serve版本打包了相关依赖,安装时间较短,可按需下载。 + +2. 转换:由于PP-Structure根据中英文数据分别进行适配,在转换相应文件时可**根据文档语言进行相应选择**。 + +### 脚本运行 + +3. 打开结果:点击`显示结果`,即可打开转换完成后的文件夹 + +> 注意: +> +> - 初次安装程序根据不同设备需要等待1-2分钟不等 +> - 使用Office与WPS打开的Word结果会出现不同,推荐以Office为准 +> - 本程序使用 [QPT](https://github.com/QPT-Family/QPT) 进行应用程序打包,感谢 [GT-ZhangAcer](https://github.com/GT-ZhangAcer) 对打包过程的支持 +> - 应用程序仅支持正版win10,11系统,不支持盗版Windows系统,若在安装过程中出现报错或缺少依赖,推荐直接使用 `paddleocr` whl包应用PDF2Word功能,详情可查看[链接](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/ppstructure/docs/quickstart.md) + +### 脚本启动界面 + +首次运行需要将切换路径到PaddleOCR文件目录 ,然后运行代码 + +``` +cd ./ppstructure/pdf2word +python pdf2word.py +``` + +### PaddleOCR whl包 + +针对Linux、Mac用户或已经拥有Python环境的用户,**推荐安装 `paddleocr` whl包直接应用PDF2Word功能**,详情可查看[链接](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/ppstructure/docs/quickstart.md) + + + +## 2.软件下载 + +如需获取已打包程序,可以扫描下方二维码,关注公众号填写问卷后,加入PaddleOCR官方交流群免费获取20G OCR学习大礼包,内含OCR场景应用集合(包含数码管、液晶屏、车牌、高精度SVTR模型等7个垂类模型)、《动手学OCR》电子书、课程回放视频、前沿论文等重磅资料 + +
+ +
+ +## 3.版本说明 + +v0.2版:新加入PDF解析功能,仅提供full版本,打包了所有依赖包与模型文件,尽可能避免安装失败问题。若仍然安装失败,推荐使用 `paddleocr` whl包 diff --git a/ppstructure/pdf2word/icons/chinese.png b/ppstructure/pdf2word/icons/chinese.png new file mode 100644 index 0000000000000000000000000000000000000000..328e2fff73bd75188fa888aae45c8cb4ca844f57 Binary files /dev/null and b/ppstructure/pdf2word/icons/chinese.png differ diff --git a/ppstructure/pdf2word/icons/english.png b/ppstructure/pdf2word/icons/english.png new file mode 100644 index 0000000000000000000000000000000000000000..536c4a910ae6fc05040f4958477a34aeae891ea0 Binary files /dev/null and b/ppstructure/pdf2word/icons/english.png differ diff --git a/ppstructure/pdf2word/icons/folder-open.png b/ppstructure/pdf2word/icons/folder-open.png new file mode 100644 index 0000000000000000000000000000000000000000..ab5f55f5a4819add116113b55f717f7a21aeafdd Binary files /dev/null and b/ppstructure/pdf2word/icons/folder-open.png differ diff --git a/ppstructure/pdf2word/icons/folder-plus.png b/ppstructure/pdf2word/icons/folder-plus.png new file mode 100644 index 0000000000000000000000000000000000000000..01ce6c10b0ed3e3975edbebbc8e886f846fabe8d Binary files /dev/null and b/ppstructure/pdf2word/icons/folder-plus.png differ diff --git a/ppstructure/pdf2word/pdf2word.py b/ppstructure/pdf2word/pdf2word.py new file mode 100644 index 0000000000000000000000000000000000000000..da555271b9d1c1fcd89937d376f07ecb5a7ab0f5 --- /dev/null +++ b/ppstructure/pdf2word/pdf2word.py @@ -0,0 +1,520 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
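+
+# Module overview (a descriptive summary of the code that follows):
+#   readImage()      - loads an image with OpenCV, or rasterizes every page of a PDF via PyMuPDF (fitz)
+#   Worker (QThread) - background thread that runs the PP-Structure pipeline page by page
+#                      (or pdf2docx directly for standard PDFs) and reports progress through Qt signals
+#   APP_Image2Doc    - main qtpy widget: downloads the EN/CN inference models on first start,
+#                      builds one StructureSystem predictor per language, and wires the buttons,
+#                      progress bar and "show result" action to the Worker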
+ +import sys +import tarfile +import os +import time +import datetime +import functools +import cv2 +import platform +import numpy as np +import fitz +from PIL import Image +from pdf2docx.converter import Converter +from qtpy.QtWidgets import QApplication, QWidget, QPushButton, QProgressBar, \ + QGridLayout, QMessageBox, QLabel, QFileDialog, QCheckBox +from qtpy.QtCore import Signal, QThread, QObject +from qtpy.QtGui import QImage, QPixmap, QIcon + +file = os.path.dirname(os.path.abspath(__file__)) +root = os.path.abspath(os.path.join(file, '../../')) +sys.path.append(file) +sys.path.insert(0, root) + +from ppstructure.predict_system import StructureSystem, save_structure_res +from ppstructure.utility import parse_args, draw_structure_result +from ppocr.utils.network import download_with_progressbar +from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx +# from ScreenShotWidget import ScreenShotWidget + +__APPNAME__ = "pdf2word" +__VERSION__ = "0.2.2" + +URLs_EN = { + # 下载超英文轻量级PP-OCRv3模型的检测模型并解压 + "en_PP-OCRv3_det_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar", + # 下载英文轻量级PP-OCRv3模型的识别模型并解压 + "en_PP-OCRv3_rec_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar", + # 下载超轻量级英文表格英文模型并解压 + "en_ppstructure_mobile_v2.0_SLANet_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + # 英文版面分析模型 + "picodet_lcnet_x1_0_fgd_layout_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar", +} +DICT_EN = { + "rec_char_dict_path": "en_dict.txt", + "layout_dict_path": "layout_publaynet_dict.txt", +} + +URLs_CN = { + # 下载超中文轻量级PP-OCRv3模型的检测模型并解压 + "cn_PP-OCRv3_det_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar", + # 下载中文轻量级PP-OCRv3模型的识别模型并解压 + "cn_PP-OCRv3_rec_infer": + "https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar", + # 下载超轻量级英文表格英文模型并解压 + "cn_ppstructure_mobile_v2.0_SLANet_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar", + # 中文版面分析模型 + "picodet_lcnet_x1_0_fgd_layout_cdla_infer": + "https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_cdla_infer.tar", +} +DICT_CN = { + "rec_char_dict_path": "ppocr_keys_v1.txt", + "layout_dict_path": "layout_cdla_dict.txt", +} + + +def QImageToCvMat(incomingImage) -> np.array: + ''' + Converts a QImage into an opencv MAT format + ''' + + incomingImage = incomingImage.convertToFormat(QImage.Format.Format_RGBA8888) + + width = incomingImage.width() + height = incomingImage.height() + + ptr = incomingImage.bits() + ptr.setsize(height * width * 4) + arr = np.frombuffer(ptr, np.uint8).reshape((height, width, 4)) + return arr + + +def readImage(image_file) -> list: + if os.path.basename(image_file)[-3:] == 'pdf': + imgs = [] + with fitz.open(image_file) as pdf: + for pg in range(0, pdf.pageCount): + page = pdf[pg] + mat = fitz.Matrix(2, 2) + pm = page.getPixmap(matrix=mat, alpha=False) + + # if width or height > 2000 pixels, don't enlarge the image + if pm.width > 2000 or pm.height > 2000: + pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) + + img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) + img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) + imgs.append(img) + else: + img = cv2.imread(image_file, cv2.IMREAD_COLOR) + if img is not None: + imgs = [img] + + return 
imgs + + +class Worker(QThread): + progressBarValue = Signal(int) + progressBarRange = Signal(int) + endsignal = Signal() + exceptedsignal = Signal(str) #发送一个异常信号 + loopFlag = True + + def __init__(self, predictors, save_pdf, vis_font_path, use_pdf2docx_api): + super(Worker, self).__init__() + self.predictors = predictors + self.save_pdf = save_pdf + self.vis_font_path = vis_font_path + self.lang = 'EN' + self.imagePaths = [] + self.use_pdf2docx_api = use_pdf2docx_api + self.outputDir = None + self.totalPageCnt = 0 + self.pageCnt = 0 + self.setStackSize(1024 * 1024) + + def setImagePath(self, imagePaths): + self.imagePaths = imagePaths + + def setLang(self, lang): + self.lang = lang + + def setOutputDir(self, outputDir): + self.outputDir = outputDir + + def setPDFParser(self, enabled): + self.use_pdf2docx_api = enabled + + def resetPageCnt(self): + self.pageCnt = 0 + + def resetTotalPageCnt(self): + self.totalPageCnt = 0 + + def ppocrPrecitor(self, imgs, img_name): + all_res = [] + # update progress bar ranges + self.totalPageCnt += len(imgs) + self.progressBarRange.emit(self.totalPageCnt) + # processing pages + for index, img in enumerate(imgs): + res, time_dict = self.predictors[self.lang](img) + + # save output + save_structure_res(res, self.outputDir, img_name) + # draw_img = draw_structure_result(img, res, self.vis_font_path) + # img_save_path = os.path.join(self.outputDir, img_name, 'show_{}.jpg'.format(index)) + # if res != []: + # cv2.imwrite(img_save_path, draw_img) + + # recovery + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) + + if all_res != []: + try: + convert_info_docx(imgs, all_res, self.outputDir, img_name) + except Exception as ex: + print("error in layout recovery image:{}, err msg: {}".format( + img_name, ex)) + print("Predict time : {:.3f}s".format(time_dict['all'])) + print('result save to {}'.format(self.outputDir)) + + def run(self): + self.resetPageCnt() + self.resetTotalPageCnt() + try: + os.makedirs(self.outputDir, exist_ok=True) + for i, image_file in enumerate(self.imagePaths): + if not self.loopFlag: + break + # using use_pdf2docx_api for PDF parsing + if self.use_pdf2docx_api \ + and os.path.basename(image_file)[-3:] == 'pdf': + self.totalPageCnt += 1 + self.progressBarRange.emit(self.totalPageCnt) + print( + '===============using use_pdf2docx_api===============') + img_name = os.path.basename(image_file).split('.')[0] + docx_file = os.path.join(self.outputDir, + '{}.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + print('docx save to {}'.format(docx_file)) + self.pageCnt += 1 + self.progressBarValue.emit(self.pageCnt) + else: + # using PPOCR for PDF/Image parsing + imgs = readImage(image_file) + if len(imgs) == 0: + continue + img_name = os.path.basename(image_file).split('.')[0] + os.makedirs( + os.path.join(self.outputDir, img_name), exist_ok=True) + self.ppocrPrecitor(imgs, img_name) + # file processed + self.endsignal.emit() + # self.exec() + except Exception as e: + self.exceptedsignal.emit(str(e)) # 将异常发送给UI进程 + + +class APP_Image2Doc(QWidget): + def __init__(self): + super().__init__() + # self.setFixedHeight(100) + # self.setFixedWidth(520) + + # settings + self.imagePaths = [] + # self.screenShotWg = ScreenShotWidget() + self.screenShot = None + self.save_pdf = False + self.output_dir = None + self.vis_font_path = os.path.join(root, "doc", "fonts", "simfang.ttf") + self.use_pdf2docx_api = False + + # ProgressBar + 
self.pb = QProgressBar() + self.pb.setRange(0, 100) + self.pb.setValue(0) + + # 初始化界面 + self.setupUi() + + # 下载模型 + self.downloadModels(URLs_EN) + self.downloadModels(URLs_CN) + + # 初始化模型 + predictors = { + 'EN': self.initPredictor('EN'), + 'CN': self.initPredictor('CN'), + } + + # 设置工作进程 + self._thread = Worker(predictors, self.save_pdf, self.vis_font_path, + self.use_pdf2docx_api) + self._thread.progressBarValue.connect( + self.handleProgressBarUpdateSingal) + self._thread.endsignal.connect(self.handleEndsignalSignal) + # self._thread.finished.connect(QObject.deleteLater) + self._thread.progressBarRange.connect(self.handleProgressBarRangeSingal) + self._thread.exceptedsignal.connect(self.handleThreadException) + self.time_start = 0 # save start time + + def setupUi(self): + self.setObjectName("MainWindow") + self.setWindowTitle(__APPNAME__ + " " + __VERSION__) + + layout = QGridLayout() + + self.openFileButton = QPushButton("打开文件") + self.openFileButton.setIcon(QIcon(QPixmap("./icons/folder-plus.png"))) + layout.addWidget(self.openFileButton, 0, 0, 1, 1) + self.openFileButton.clicked.connect(self.handleOpenFileSignal) + + # screenShotButton = QPushButton("截图识别") + # layout.addWidget(screenShotButton, 0, 1, 1, 1) + # screenShotButton.clicked.connect(self.screenShotSlot) + # screenShotButton.setEnabled(False) # temporarily disenble + + self.startCNButton = QPushButton("中文转换") + self.startCNButton.setIcon(QIcon(QPixmap("./icons/chinese.png"))) + layout.addWidget(self.startCNButton, 0, 1, 1, 1) + self.startCNButton.clicked.connect( + functools.partial(self.handleStartSignal, 'CN', False)) + + self.startENButton = QPushButton("英文转换") + self.startENButton.setIcon(QIcon(QPixmap("./icons/english.png"))) + layout.addWidget(self.startENButton, 0, 2, 1, 1) + self.startENButton.clicked.connect( + functools.partial(self.handleStartSignal, 'EN', False)) + + self.PDFParserButton = QPushButton('PDF解析', self) + layout.addWidget(self.PDFParserButton, 0, 3, 1, 1) + self.PDFParserButton.clicked.connect( + functools.partial(self.handleStartSignal, 'CN', True)) + + self.showResultButton = QPushButton("显示结果") + self.showResultButton.setIcon(QIcon(QPixmap("./icons/folder-open.png"))) + layout.addWidget(self.showResultButton, 0, 4, 1, 1) + self.showResultButton.clicked.connect(self.handleShowResultSignal) + + # ProgressBar + layout.addWidget(self.pb, 2, 0, 1, 5) + # time estimate label + self.timeEstLabel = QLabel(("Time Left: --")) + layout.addWidget(self.timeEstLabel, 3, 0, 1, 5) + + self.setLayout(layout) + + def downloadModels(self, URLs): + # using custom model + tar_file_name_list = [ + 'inference.pdiparams', 'inference.pdiparams.info', + 'inference.pdmodel', 'model.pdiparams', 'model.pdiparams.info', + 'model.pdmodel' + ] + model_path = os.path.join(root, 'inference') + os.makedirs(model_path, exist_ok=True) + + # download and unzip models + for name in URLs.keys(): + url = URLs[name] + print("Try downloading file: {}".format(url)) + tarname = url.split('/')[-1] + tarpath = os.path.join(model_path, tarname) + if os.path.exists(tarpath): + print("File have already exist. 
skip") + else: + try: + download_with_progressbar(url, tarpath) + except Exception as e: + print( + "Error occurred when downloading file, error message:") + print(e) + + # unzip model tar + try: + with tarfile.open(tarpath, 'r') as tarObj: + storage_dir = os.path.join(model_path, name) + os.makedirs(storage_dir, exist_ok=True) + for member in tarObj.getmembers(): + filename = None + for tar_file_name in tar_file_name_list: + if tar_file_name in member.name: + filename = tar_file_name + if filename is None: + continue + file = tarObj.extractfile(member) + with open(os.path.join(storage_dir, filename), + 'wb') as f: + f.write(file.read()) + except Exception as e: + print("Error occurred when unziping file, error message:") + print(e) + + def initPredictor(self, lang='EN'): + # init predictor args + args = parse_args() + args.table_max_len = 488 + args.ocr = True + args.recovery = True + args.save_pdf = self.save_pdf + args.table_char_dict_path = os.path.join(root, "ppocr", "utils", "dict", + "table_structure_dict.txt") + if lang == 'EN': + args.det_model_dir = os.path.join( + root, # 此处从这里找到模型存放位置 + "inference", + "en_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, "inference", + "en_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join( + root, "inference", "en_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join( + root, "inference", "picodet_lcnet_x1_0_fgd_layout_infer") + lang_dict = DICT_EN + elif lang == 'CN': + args.det_model_dir = os.path.join( + root, # 此处从这里找到模型存放位置 + "inference", + "cn_PP-OCRv3_det_infer") + args.rec_model_dir = os.path.join(root, "inference", + "cn_PP-OCRv3_rec_infer") + args.table_model_dir = os.path.join( + root, "inference", "cn_ppstructure_mobile_v2.0_SLANet_infer") + args.output = os.path.join(root, "output") # 结果保存路径 + args.layout_model_dir = os.path.join( + root, "inference", "picodet_lcnet_x1_0_fgd_layout_cdla_infer") + lang_dict = DICT_CN + else: + raise ValueError("Unsupported language") + args.rec_char_dict_path = os.path.join(root, "ppocr", "utils", + lang_dict['rec_char_dict_path']) + args.layout_dict_path = os.path.join(root, "ppocr", "utils", "dict", + "layout_dict", + lang_dict['layout_dict_path']) + # init predictor + return StructureSystem(args) + + def handleOpenFileSignal(self): + ''' + 可以多选图像文件 + ''' + selectedFiles = QFileDialog.getOpenFileNames( + self, "多文件选择", "/", "图片文件 (*.png *.jpeg *.jpg *.bmp *.pdf)")[0] + if len(selectedFiles) > 0: + self.imagePaths = selectedFiles + self.screenShot = None # discard screenshot temp image + self.pb.setValue(0) + + # def screenShotSlot(self): + # ''' + # 选定图像文件和截图的转换过程只能同时进行一个 + # 截图只能同时转换一个 + # ''' + # self.screenShotWg.start() + # if self.screenShotWg.captureImage: + # self.screenShot = self.screenShotWg.captureImage + # self.imagePaths.clear() # discard openfile temp list + # self.pb.setRange(0, 1) + # self.pb.setValue(0) + + def handleStartSignal(self, lang='EN', pdfParser=False): + if self.screenShot: # for screenShot + img_name = 'screenshot_' + time.strftime("%Y%m%d%H%M%S", + time.localtime()) + image = QImageToCvMat(self.screenShot) + self.predictAndSave(image, img_name, lang) + # update Progress Bar + self.pb.setValue(1) + QMessageBox.information(self, u'Information', "文档提取完成") + elif len(self.imagePaths) > 0: # for image file selection + # Must set image path list and language before start + self.output_dir = os.path.join( + os.path.dirname(self.imagePaths[0]), + "output") # output_dir shold be 
same as imagepath + self._thread.setOutputDir(self.output_dir) + self._thread.setImagePath(self.imagePaths) + self._thread.setLang(lang) + self._thread.setPDFParser(pdfParser) + # disenble buttons + self.openFileButton.setEnabled(False) + self.startCNButton.setEnabled(False) + self.startENButton.setEnabled(False) + self.PDFParserButton.setEnabled(False) + # 启动工作进程 + self._thread.start() + self.time_start = time.time() # log start time + QMessageBox.information(self, u'Information', "开始转换") + else: + QMessageBox.warning(self, u'Information', "请选择要识别的文件或截图") + + def handleShowResultSignal(self): + if self.output_dir is None: + return + if os.path.exists(self.output_dir): + if platform.system() == 'Windows': + os.startfile(self.output_dir) + else: + os.system('open ' + os.path.normpath(self.output_dir)) + else: + QMessageBox.information(self, u'Information', "输出文件不存在") + + def handleProgressBarUpdateSingal(self, i): + self.pb.setValue(i) + # calculate time left of recognition + lenbar = self.pb.maximum() + avg_time = (time.time() - self.time_start + ) / i # Use average time to prevent time fluctuations + time_left = str(datetime.timedelta(seconds=avg_time * ( + lenbar - i))).split(".")[0] # Remove microseconds + self.timeEstLabel.setText(f"Time Left: {time_left}") # show time left + + def handleProgressBarRangeSingal(self, max): + self.pb.setRange(0, max) + + def handleEndsignalSignal(self): + # enble buttons + self.openFileButton.setEnabled(True) + self.startCNButton.setEnabled(True) + self.startENButton.setEnabled(True) + self.PDFParserButton.setEnabled(True) + QMessageBox.information(self, u'Information', "转换结束") + + def handleCBChangeSignal(self): + self._thread.setPDFParser(self.checkBox.isChecked()) + + def handleThreadException(self, message): + self._thread.quit() + QMessageBox.information(self, 'Error', message) + + +def main(): + app = QApplication(sys.argv) + + window = APP_Image2Doc() # 创建对象 + window.show() # 全屏显示窗口 + + QApplication.processEvents() + sys.exit(app.exec()) + + +if __name__ == "__main__": + main() diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py new file mode 100644 index 0000000000000000000000000000000000000000..b32b706299bc188d824ac984173c97adc378f8de --- /dev/null +++ b/ppstructure/predict_system.py @@ -0,0 +1,315 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
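+
+# Overview (a descriptive summary of the code that follows):
+#   StructureSystem.__call__() runs the PP-Structure pipeline on a single image:
+#     1. optional text-image orientation classification (the image is rotated back if needed)
+#     2. layout analysis (LayoutPredictor) to split the page into text / table / figure regions
+#     3. per region: table structure recognition (TableSystem) for tables, text detection and
+#        recognition (TextSystem) for everything else
+#   A separate 'kie' mode delegates the whole image to SerRePredictor instead.
+#   save_structure_res() writes region results to disk (res_*.txt, xlsx for tables, cropped
+#   figure images), and main() optionally rebuilds a docx file when --recovery is enabled.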
+ +import os +import sys +import subprocess + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +import cv2 +import json +import numpy as np +import time +import logging +from copy import deepcopy + +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.utils.logging import get_logger +from ppocr.utils.visual import draw_ser_results, draw_re_results +from tools.infer.predict_system import TextSystem +from ppstructure.layout.predict_layout import LayoutPredictor +from ppstructure.table.predict_table import TableSystem, to_excel +from ppstructure.utility import parse_args, draw_structure_result + +logger = get_logger() + + +class StructureSystem(object): + def __init__(self, args): + self.mode = args.mode + self.recovery = args.recovery + + self.image_orientation_predictor = None + if args.image_orientation: + import paddleclas + self.image_orientation_predictor = paddleclas.PaddleClas( + model_name="text_image_orientation") + + if self.mode == 'structure': + if not args.show_log: + logger.setLevel(logging.INFO) + if args.layout == False and args.ocr == True: + args.ocr = False + logger.warning( + "When args.layout is false, args.ocr is automatically set to false" + ) + args.drop_score = 0 + # init model + self.layout_predictor = None + self.text_system = None + self.table_system = None + if args.layout: + self.layout_predictor = LayoutPredictor(args) + if args.ocr: + self.text_system = TextSystem(args) + if args.table: + if self.text_system is not None: + self.table_system = TableSystem( + args, self.text_system.text_detector, + self.text_system.text_recognizer) + else: + self.table_system = TableSystem(args) + + elif self.mode == 'kie': + from ppstructure.kie.predict_kie_token_ser_re import SerRePredictor + self.kie_predictor = SerRePredictor(args) + + def __call__(self, img, return_ocr_result_in_table=False, img_idx=0): + time_dict = { + 'image_orientation': 0, + 'layout': 0, + 'table': 0, + 'table_match': 0, + 'det': 0, + 'rec': 0, + 'kie': 0, + 'all': 0 + } + start = time.time() + if self.image_orientation_predictor is not None: + tic = time.time() + cls_result = self.image_orientation_predictor.predict( + input_data=img) + cls_res = next(cls_result) + angle = cls_res[0]['label_names'][0] + cv_rotate_code = { + '90': cv2.ROTATE_90_COUNTERCLOCKWISE, + '180': cv2.ROTATE_180, + '270': cv2.ROTATE_90_CLOCKWISE + } + if angle in cv_rotate_code: + img = cv2.rotate(img, cv_rotate_code[angle]) + toc = time.time() + time_dict['image_orientation'] = toc - tic + if self.mode == 'structure': + ori_im = img.copy() + if self.layout_predictor is not None: + layout_res, elapse = self.layout_predictor(img) + time_dict['layout'] += elapse + else: + h, w = ori_im.shape[:2] + layout_res = [dict(bbox=None, label='table')] + res_list = [] + for region in layout_res: + res = '' + if region['bbox'] is not None: + x1, y1, x2, y2 = region['bbox'] + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + roi_img = ori_im[y1:y2, x1:x2, :] + else: + x1, y1, x2, y2 = 0, 0, w, h + roi_img = ori_im + if region['label'] == 'table': + if self.table_system is not None: + res, table_time_dict = self.table_system( + roi_img, return_ocr_result_in_table) + time_dict['table'] += table_time_dict['table'] + time_dict['table_match'] += table_time_dict['match'] + time_dict['det'] += table_time_dict['det'] + time_dict['rec'] += table_time_dict['rec'] + 
else: + if self.text_system is not None: + if self.recovery: + wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype) + wht_im[y1:y2, x1:x2, :] = roi_img + filter_boxes, filter_rec_res, ocr_time_dict = self.text_system( + wht_im) + else: + filter_boxes, filter_rec_res, ocr_time_dict = self.text_system( + roi_img) + time_dict['det'] += ocr_time_dict['det'] + time_dict['rec'] += ocr_time_dict['rec'] + + # remove style char, + # when using the recognition model trained on the PubtabNet dataset, + # it will recognize the text format in the table, such as + style_token = [ + '', '', '', '', '', + '', '', '', '', + '', '', '', '', + '' + ] + res = [] + for box, rec_res in zip(filter_boxes, filter_rec_res): + rec_str, rec_conf = rec_res + for token in style_token: + if token in rec_str: + rec_str = rec_str.replace(token, '') + if not self.recovery: + box += [x1, y1] + res.append({ + 'text': rec_str, + 'confidence': float(rec_conf), + 'text_region': box.tolist() + }) + res_list.append({ + 'type': region['label'].lower(), + 'bbox': [x1, y1, x2, y2], + 'img': roi_img, + 'res': res, + 'img_idx': img_idx + }) + end = time.time() + time_dict['all'] = end - start + return res_list, time_dict + elif self.mode == 'kie': + re_res, elapse = self.kie_predictor(img) + time_dict['kie'] = elapse + time_dict['all'] = elapse + return re_res[0], time_dict + return None, None + + +def save_structure_res(res, save_folder, img_name, img_idx=0): + excel_save_folder = os.path.join(save_folder, img_name) + os.makedirs(excel_save_folder, exist_ok=True) + res_cp = deepcopy(res) + # save res + with open( + os.path.join(excel_save_folder, 'res_{}.txt'.format(img_idx)), + 'w', + encoding='utf8') as f: + for region in res_cp: + roi_img = region.pop('img') + f.write('{}\n'.format(json.dumps(region))) + + if region['type'].lower() == 'table' and len(region[ + 'res']) > 0 and 'html' in region['res']: + excel_path = os.path.join( + excel_save_folder, + '{}_{}.xlsx'.format(region['bbox'], img_idx)) + to_excel(region['res']['html'], excel_path) + elif region['type'].lower() == 'figure': + img_path = os.path.join( + excel_save_folder, + '{}_{}.jpg'.format(region['bbox'], img_idx)) + cv2.imwrite(img_path, roi_img) + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + image_file_list = image_file_list + image_file_list = image_file_list[args.process_id::args.total_process_num] + + if not args.use_pdf2docx_api: + structure_sys = StructureSystem(args) + save_folder = os.path.join(args.output, structure_sys.mode) + os.makedirs(save_folder, exist_ok=True) + img_num = len(image_file_list) + + for i, image_file in enumerate(image_file_list): + logger.info("[{}/{}] {}".format(i, img_num, image_file)) + img, flag_gif, flag_pdf = check_and_read(image_file) + img_name = os.path.basename(image_file).split('.')[0] + + if args.recovery and args.use_pdf2docx_api and flag_pdf: + from pdf2docx.converter import Converter + os.makedirs(args.output, exist_ok=True) + docx_file = os.path.join(args.output, + '{}_api.docx'.format(img_name)) + cv = Converter(image_file) + cv.convert(docx_file) + cv.close() + logger.info('docx save to {}'.format(docx_file)) + continue + + if not flag_gif and not flag_pdf: + img = cv2.imread(image_file) + + if not flag_pdf: + if img is None: + logger.error("error in loading image:{}".format(image_file)) + continue + imgs = [img] + else: + imgs = img + + all_res = [] + for index, img in enumerate(imgs): + res, time_dict = structure_sys(img, img_idx=index) + img_save_path = os.path.join(save_folder, 
img_name, + 'show_{}.jpg'.format(index)) + os.makedirs(os.path.join(save_folder, img_name), exist_ok=True) + if structure_sys.mode == 'structure' and res != []: + draw_img = draw_structure_result(img, res, args.vis_font_path) + save_structure_res(res, save_folder, img_name, index) + elif structure_sys.mode == 'kie': + if structure_sys.kie_predictor.predictor is not None: + draw_img = draw_re_results( + img, res, font_path=args.vis_font_path) + else: + draw_img = draw_ser_results( + img, res, font_path=args.vis_font_path) + + with open( + os.path.join(save_folder, img_name, + 'res_{}_kie.txt'.format(index)), + 'w', + encoding='utf8') as f: + res_str = '{}\t{}\n'.format( + image_file, + json.dumps( + { + "ocr_info": res + }, ensure_ascii=False)) + f.write(res_str) + if res != []: + cv2.imwrite(img_save_path, draw_img) + logger.info('result save to {}'.format(img_save_path)) + if args.recovery and res != []: + from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx + h, w, _ = img.shape + res = sorted_layout_boxes(res, w) + all_res += res + + if args.recovery and all_res != []: + try: + convert_info_docx(img, all_res, save_folder, img_name) + except Exception as ex: + logger.error("error in layout recovery image:{}, err msg: {}". + format(image_file, ex)) + continue + logger.info("Predict time : {:.3f}s".format(time_dict['all'])) + + +if __name__ == "__main__": + args = parse_args() + if args.use_mp: + p_list = [] + total_process_num = args.total_process_num + for process_id in range(total_process_num): + cmd = [sys.executable, "-u"] + sys.argv + [ + "--process_id={}".format(process_id), + "--use_mp={}".format(False) + ] + p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) + p_list.append(p) + for p in p_list: + p.wait() + else: + main(args) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md new file mode 100644 index 0000000000000000000000000000000000000000..46a348c8e5d4cf3e43c4287ee5b37030426c1524 --- /dev/null +++ b/ppstructure/recovery/README.md @@ -0,0 +1,213 @@ +English | [简体中文](README_ch.md) + +# Layout Recovery + +- [1. Introduction](#1) +- [2. Install](#2) + - [2.1 Install PaddlePaddle](#2.1) + - [2.2 Install PaddleOCR](#2.2) +- [3. Quick Start using standard PDF parse](#3) +- [4. Quick Start using image format PDF parse ](#4) + - [4.1 Download models](#4.1) + - [4.2 Layout recovery](#4.2) +- [5. More](#5) + + + +## 1. Introduction + +The layout recovery module is used to restore the image or pdf to an +editable Word file consistent with the original image layout. + +Two layout recovery methods are provided, you can choose by PDF format: + +- **Standard PDF parse(the input is standard PDF)**: Python based PDF to word library [pdf2docx] (https://github.com/dothinking/pdf2docx) is optimized, the method extracts data from PDF with PyMuPDF, then parse layout with rule, finally, generate docx with python-docx. + +- **Image format PDF parse(the input can be standard PDF or image format PDF)**: Layout recovery combines [layout analysis](../layout/README.md)、[table recognition](../table/README.md) to better recover images, tables, titles, etc. supports input files in PDF and document image formats in Chinese and English. 
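+
+As a quick preview of the two entry points (both are covered in detail in sections 3 and 4 below), a minimal command-line sketch is shown here; `sample.pdf` and `sample.png` are placeholder input files:
+
+```bash
+# Standard PDF parse: the input must be a text-based PDF; conversion goes through the optimized pdf2docx path
+paddleocr --image_dir=sample.pdf --type=structure --recovery=true --use_pdf2docx_api=true
+
+# Image format PDF parse: scanned PDFs or document images; recovery combines layout analysis, OCR and table recognition
+paddleocr --image_dir=sample.png --type=structure --recovery=true --lang='en'
+```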
+
+The input formats and application scenarios of the two methods are as follows:
+
+| method | input formats | application scenarios / known issues |
+| :-----: | :----------: | :----------------------------------------------------------: |
+| Standard PDF parse | pdf | Advantages: better recovery for non-paper documents; each page stays on its own page after restoration<br>Disadvantages: English characters in some Chinese documents come out garbled, some content still overflows the current page, a whole page may be restored as a table, and some pictures are not recovered well |
+| Image format PDF parse | pdf, picture | Advantages: better suited to recovering the body content of paper-style documents; good OCR recognition quality<br>
Disadvantages: Currently, the recovery is based on rules, the effect of content typesetting (spacing, fonts, etc.) need to be further improved, and the effect of layout recovery depends on layout analysis | + +The following figure shows the effect of restoring the layout of documents by using PDF parse: + +
+ +
+ +The following figures show the effect of restoring the layout of English and Chinese documents by using OCR technique: + +
+ +
+ +
+ +
+ + + + +## 2. Install + + + +### 2.1 Install PaddlePaddle + +```bash +python3 -m pip install --upgrade pip + +# If you have cuda9 or cuda10 installed on your machine, please run the following command to install +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple + +# CPU installation +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple +```` + +For more requirements, please refer to the instructions in [Installation Documentation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/install/pip/macos-pip_en.html). + + + +### 2.2 Install PaddleOCR + +- **(1) Download source code** + +```bash +[Recommended] git clone https://github.com/PaddlePaddle/PaddleOCR + +# If the pull cannot be successful due to network problems, you can also choose to use the hosting on the code cloud: +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# Note: Code cloud hosting code may not be able to synchronize the update of this github project in real time, there is a delay of 3 to 5 days, please use the recommended method first. +```` + +- **(2) Install recovery `requirements`** + +The layout restoration is exported as docx files, so python-docx API need to be installed, and PyMuPDF api([requires Python >= 3.7](https://pypi.org/project/PyMuPDF/)) need to be installed to process the input files in pdf format. + +Install all the libraries by running the following command: + +```bash +python3 -m pip install -r ppstructure/recovery/requirements.txt +```` + + And if using pdf parse method, we need to install pdf2docx api. + +```bash +wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl +pip3 install pdf2docx-0.0.0-py3-none-any.whl +``` + + + +## 3. Quick Start using standard PDF parse + +`use_pdf2docx_api` use PDF parse for layout recovery, The whl package is also provided for quick use, follow the above code, for more infomation please refer to [quickstart](../docs/quickstart_en.md) for details. + +```bash +# install paddleocr +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +Command line: + +```bash +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + + +## 4. Quick Start using image format PDF parse + +Through layout analysis, we divided the image/PDF documents into regions, located the key regions, such as text, table, picture, etc., and recorded the location, category, and regional pixel value information of each region. Different regions are processed separately, where: + +- OCR detection and recognition is performed in the text area, and the coordinates of the OCR detection box and the text content information are added on the basis of the previous information + +- The table area identifies tables and records html and text information of tables +- Save the image directly + +We can restore the test picture through the layout information, OCR detection and recognition structure, table information, and saved pictures. + +The whl package is also provided for quick use, follow the above code, for more infomation please refer to [quickstart](../docs/quickstart_en.md) for details. 
+ +```bash +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +``` + + +### 4.1 Download models + +If input is English document, download English models: + +```bash +cd PaddleOCR/ppstructure + +# download model +mkdir inference && cd inference +# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it +https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# Download the ultra-lightweight English table inch model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# Download the layout model of publaynet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar +cd .. +``` +If input is Chinese document,download Chinese models: +[Chinese and English ultra-lightweight PP-OCRv3 model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README.md#pp-ocr-series-model-listupdate-on-september-8th)、[table recognition model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[layout analysis model](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) + + +### 4.2 Layout recovery + + +```bash +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --output=../output/ +``` + +After running, the docx of each picture will be saved in the directory specified by the output field + +Field: + +- image_dir:test file, can be picture, picture directory, pdf file, pdf file directory +- det_model_dir:OCR detection model path +- rec_model_dir:OCR recognition model path +- rec_char_dict_path:OCR recognition dict path. If the Chinese model is used, change to "../ppocr/utils/ppocr_keys_v1.txt". And if you trained the model on your own dataset, change to the trained dictionary +- table_model_dir:tabel recognition model path +- table_char_dict_path:tabel recognition dict path. If the Chinese model is used, no need to change +- layout_model_dir:layout analysis model path +- layout_dict_path:layout analysis dict path. If the Chinese model is used, change to "../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:whether to enable layout of recovery, default False +- output:save the recovery result path + + + +## 5. More + +For training, evaluation and inference tutorial for text detection models, please refer to [text detection doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/detection_en.md). 
+ +For training, evaluation and inference tutorial for text recognition models, please refer to [text recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_en/recognition_en.md). + +For training, evaluation and inference tutorial for layout analysis models, please refer to [layout analysis doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README.md) + +For training, evaluation and inference tutorial for table recognition models, please refer to [table recognition doc](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README.md) diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..5a60bd81903aaab81e8b7c716de346bafccbc970 --- /dev/null +++ b/ppstructure/recovery/README_ch.md @@ -0,0 +1,222 @@ +[English](README.md) | 简体中文 + +# 版面恢复 + +- [1. 简介](#1) +- [2. 安装](#2) + - [2.1 安装PaddlePaddle](#2.1) + - [2.2 安装PaddleOCR](#2.2) +- [3.使用标准PDF解析进行版面恢复](#3) +- [4. 使用图片格式PDF解析进行版面恢复](#4) + - [4.1 下载模型](#4.1) + - [4.2 版面恢复](#4.2) +- [5. 更多](#5) + + + +## 1. 简介 + +版面恢复就是将输入的图片、pdf内容仍然像原文档那样排列着,段落不变、顺序不变的输出到word文档中等。 + +提供了2种版面恢复方法,可根据输入PDF的格式进行选择: + +- **标准PDF解析(输入须为标准PDF)**:基于Python的pdf转word库[pdf2docx](https://github.com/dothinking/pdf2docx)进行优化,该方法通过PyMuPDF获取页面元素,然后利用规则解析章节、段落、表格等布局及样式,最后通过python-docx将解析的内容元素重建到word文档中。 +- **图片格式PDF解析(输入可为标准PDF或图片格式PDF)**:结合[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,支持中、英文pdf文档、文档图片格式的输入文件。 + +2种方法输入格式、适用场景如下: + +| 方法 | 支持输入文件 | 适用场景/存在问题 | +| :-------------: | :----------: | :----------------------------------------------------------: | +| 标准PDF解析 | pdf | 优点:非论文文档恢复效果更优、每一页内容恢复后仍在同一页
缺点:有些中文文档中的英文乱码、仍存在内容超出当前页面的情况、整页内容恢复为表格格式、部分图片恢复效果不佳 | +| 图片格式PDF解析 | pdf、图片 | 优点:更适合论文文档正文内容的恢复、中英文文档OCR识别效果好
缺点:目前内容恢复基于规则,内容排版效果(间距、字体等)待进一步提升、版面恢复效果依赖于版面分析效果 | + +下图展示了通过PDF解析版面恢复效果: + +
+ +
+ +下图分别展示了通过OCR技术,英文文档和中文文档版面恢复的效果: + +
+ +
+
+ +
+ + +## 2. 安装 + + + +### 2.1 安装PaddlePaddle + +```bash +python3 -m pip install --upgrade pip + +# 您的机器安装的是CUDA9或CUDA10,请运行以下命令安装 +python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple + +# 您的机器是CPU,请运行以下命令安装 +python3 -m pip install "paddlepaddle" -i https://mirror.baidu.com/pypi/simple + +``` + +更多需求,请参照[安装文档](https://www.paddlepaddle.org.cn/install/quick)中的说明进行操作。 + + + +### 2.2 安装PaddleOCR + +- **(1)下载版面恢复源码** + +```bash +【推荐】git clone https://github.com/PaddlePaddle/PaddleOCR + +# 如果因为网络问题无法pull成功,也可选择使用码云上的托管: +git clone https://gitee.com/paddlepaddle/PaddleOCR + +# 注:码云托管代码可能无法实时同步本github项目更新,存在3~5天延时,请优先使用推荐方式。 +``` + +- **(2)安装recovery的`requirements`** + +版面恢复导出为docx文件,所以需要安装Python处理word文档的python-docx API,同时处理pdf格式的输入文件,需要安装PyMuPDF API([要求Python >= 3.7](https://pypi.org/project/PyMuPDF/))。 + +通过如下命令安装全部库: + +```bash +python3 -m pip install -r ppstructure/recovery/requirements.txt +``` + +使用pdf2docx库解析的方式恢复文档需要安装优化的pdf2docx。 + +```bash +wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl +pip3 install pdf2docx-0.0.0-py3-none-any.whl +``` + + + +## 3.使用标准PDF解析进行版面恢复 + +`use_pdf2docx_api`表示使用PDF解析的方式进行版面恢复,通过whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 + +```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --use_pdf2docx_api=true +``` + +通过命令行的方式: + +```bash +python3 predict_system.py \ + --image_dir=ppstructure/recovery/UnrealText.pdf \ + --recovery=True \ + --use_pdf2docx_api=True \ + --output=../output/ +``` + + + +## 4.使用图片格式PDF解析进行版面恢复 + +我们通过版面分析对图片/pdf形式的文档进行区域划分,定位其中的关键区域,如文字、表格、图片等,记录每个区域的位置、类别、区域像素值信息。对不同的区域分别处理,其中: + +- 文字区域直接进行OCR检测和识别,在之前信息基础上增加OCR检测框坐标和文本内容信息 + +- 表格区域进行表格识别,记录表格html和文字信息 +- 图片直接保存 + +我们通过版面信息、OCR检测和识别结构、表格信息、保存的图片,对测试图片进行恢复即可。 + +提供如下代码实现版面恢复,也提供了whl包的形式方便快速使用,代码如下,更多信息详见 [quickstart](../docs/quickstart.md)。 + +```bash +# 安装 paddleocr,推荐使用2.6版本 +pip3 install "paddleocr>=2.6" +# 中文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true +# 英文测试图 +paddleocr --image_dir=ppstructure/docs/table/1.png --type=structure --recovery=true --lang='en' +# pdf测试文件 +paddleocr --image_dir=ppstructure/recovery/UnrealText.pdf --type=structure --recovery=true --lang='en' +``` + + + +### 4.1 下载模型 + +如果输入为英文文档类型,下载OCR检测和识别、版面分析、表格识别的英文模型 + +```bash +cd PaddleOCR/ppstructure + +# 下载模型 +mkdir inference && cd inference +# 下载英文超轻量PP-OCRv3检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# 下载英文超轻量PP-OCRv3识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# 下载英文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar +tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +# 下载英文版面分析模型 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar +tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar +cd .. 
+``` + +如果输入为中文文档类型,在下述链接中下载中文模型即可: + +[PP-OCRv3中英文超轻量文本检测和识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/README_ch.md#pp-ocr%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%E6%9B%B4%E6%96%B0%E4%B8%AD)、[表格识别模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#22-表格识别模型)、[版面分析模型](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/docs/models_list.md#1-版面分析模型) + + + +### 4.2 版面恢复 + +使用下载的模型恢复给定文档的版面,以英文模型为例,执行如下命令: + +```bash +python3 predict_system.py \ + --image_dir=./docs/table/1.png \ + --det_model_dir=inference/en_PP-OCRv3_det_infer \ + --rec_model_dir=inference/en_PP-OCRv3_rec_infer \ + --rec_char_dict_path=../ppocr/utils/en_dict.txt \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --layout_model_dir=inference/picodet_lcnet_x1_0_fgd_layout_infer \ + --layout_dict_path=../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt \ + --vis_font_path=../doc/fonts/simfang.ttf \ + --recovery=True \ + --output=../output/ +``` + +运行完成后,恢复版面的docx文档会保存到`output`字段指定的目录下 + +字段含义: + +- image_dir:测试文件,可以是图片、图片目录、pdf文件、pdf文件目录 +- det_model_dir:OCR检测模型路径 +- rec_model_dir:OCR识别模型路径 +- rec_char_dict_path:OCR识别字典,如果更换为中文模型,需要更改为"../ppocr/utils/ppocr_keys_v1.txt",如果您在自己的数据集上训练的模型,则更改为训练的字典的文件 +- table_model_dir:表格识别模型路径 +- table_char_dict_path:表格识别字典,如果更换为中文模型,不需要更换字典 +- layout_model_dir:版面分析模型路径 +- layout_dict_path:版面分析字典,如果更换为中文模型,需要更改为"../ppocr/utils/dict/layout_dict/layout_cdla_dict.txt" +- recovery:是否进行版面恢复,默认False +- output:版面恢复结果保存路径 + + + +## 5. 更多 + +关于OCR检测模型的训练评估与推理,请参考:[文本检测教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/detection.md) + +关于OCR识别模型的训练评估与推理,请参考:[文本识别教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/doc/doc_ch/recognition.md) + +关于版面分析模型的训练评估与推理,请参考:[版面分析教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README_ch.md) + +关于表格识别模型的训练评估与推理,请参考:[表格识别教程](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/table/README_ch.md) diff --git a/ppstructure/recovery/__init__.py b/ppstructure/recovery/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/recovery/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py new file mode 100644 index 0000000000000000000000000000000000000000..05018120820d49216e292d1d4a726cba6458db66 --- /dev/null +++ b/ppstructure/recovery/recovery_to_doc.py @@ -0,0 +1,146 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from copy import deepcopy + +from docx import Document +from docx import shared +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.section import WD_SECTION +from docx.oxml.ns import qn +from docx.enum.table import WD_TABLE_ALIGNMENT + +from ppstructure.recovery.table_process import HtmlToDocx + +from ppocr.utils.logging import get_logger +logger = get_logger() + + +def convert_info_docx(img, res, save_folder, img_name): + doc = Document() + doc.styles['Normal'].font.name = 'Times New Roman' + doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体') + doc.styles['Normal'].font.size = shared.Pt(6.5) + + flag = 1 + for i, region in enumerate(res): + img_idx = region['img_idx'] + if flag == 2 and region['layout'] == 'single': + section = doc.add_section(WD_SECTION.CONTINUOUS) + section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1') + flag = 1 + elif flag == 1 and region['layout'] == 'double': + section = doc.add_section(WD_SECTION.CONTINUOUS) + section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2') + flag = 2 + + if region['type'].lower() == 'figure': + excel_save_folder = os.path.join(save_folder, img_name) + img_path = os.path.join(excel_save_folder, + '{}_{}.jpg'.format(region['bbox'], img_idx)) + paragraph_pic = doc.add_paragraph() + paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER + run = paragraph_pic.add_run("") + if flag == 1: + run.add_picture(img_path, width=shared.Inches(5)) + elif flag == 2: + run.add_picture(img_path, width=shared.Inches(2)) + elif region['type'].lower() == 'title': + doc.add_heading(region['res'][0]['text']) + elif region['type'].lower() == 'table': + parser = HtmlToDocx() + parser.table_style = 'TableGrid' + parser.handle_table(region['res']['html'], doc) + else: + paragraph = doc.add_paragraph() + paragraph_format = paragraph.paragraph_format + for i, line in enumerate(region['res']): + if i == 0: + paragraph_format.first_line_indent = shared.Inches(0.25) + text_run = paragraph.add_run(line['text'] + ' ') + text_run.font.size = shared.Pt(10) + + # save to docx + docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name)) + doc.save(docx_path) + logger.info('docx save to {}'.format(docx_path)) + + +def sorted_layout_boxes(res, w): + """ + Sort text boxes in order from top to bottom, left to right + args: + res(list):ppstructure results + return: + sorted results(list) + """ + num_boxes = len(res) + if num_boxes == 1: + res[0]['layout'] = 'single' + return res + + sorted_boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0])) + _boxes = list(sorted_boxes) + + new_res = [] + res_left = [] + res_right = [] + i = 0 + + while True: + if i >= num_boxes: + break + if i == num_boxes - 1: + if _boxes[i]['bbox'][1] > _boxes[i - 1]['bbox'][3] and _boxes[i][ + 'bbox'][0] < w / 2 and _boxes[i]['bbox'][2] > w / 2: + new_res += res_left + new_res += res_right + _boxes[i]['layout'] = 'single' + new_res.append(_boxes[i]) + else: + if _boxes[i]['bbox'][2] > w / 2: + _boxes[i]['layout'] = 'double' + res_right.append(_boxes[i]) + new_res += res_left + new_res += res_right + elif _boxes[i]['bbox'][0] < w / 
2: + _boxes[i]['layout'] = 'double' + res_left.append(_boxes[i]) + new_res += res_left + new_res += res_right + res_left = [] + res_right = [] + break + elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3 * w / 4: + _boxes[i]['layout'] = 'double' + res_left.append(_boxes[i]) + i += 1 + elif _boxes[i]['bbox'][0] > w / 4 and _boxes[i]['bbox'][2] > w / 2: + _boxes[i]['layout'] = 'double' + res_right.append(_boxes[i]) + i += 1 + else: + new_res += res_left + new_res += res_right + _boxes[i]['layout'] = 'single' + new_res.append(_boxes[i]) + res_left = [] + res_right = [] + i += 1 + if res_left: + new_res += res_left + if res_right: + new_res += res_right + return new_res diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..761b9d7c3e34cedb335e2c93707619593ebede63 --- /dev/null +++ b/ppstructure/recovery/requirements.txt @@ -0,0 +1,5 @@ +python-docx +beautifulsoup4 +fonttools>=4.24.0 +fire>=0.3.0 +pdf2docx \ No newline at end of file diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py new file mode 100644 index 0000000000000000000000000000000000000000..77a6ef7659666ebcbe54dd0c107cb2d62e4c7273 --- /dev/null +++ b/ppstructure/recovery/table_process.py @@ -0,0 +1,317 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: https://github.com/weizwx/html2docx/blob/master/htmldocx/h2d.py +""" + +import re +import docx +from docx import Document +from bs4 import BeautifulSoup +from html.parser import HTMLParser + + +def get_table_rows(table_soup): + table_row_selectors = [ + 'table > tr', 'table > thead > tr', 'table > tbody > tr', + 'table > tfoot > tr' + ] + # If there's a header, body, footer or direct child tr tags, add row dimensions from there + return table_soup.select(', '.join(table_row_selectors), recursive=False) + + +def get_table_columns(row): + # Get all columns for the specified row tag. 
+ return row.find_all(['th', 'td'], recursive=False) if row else [] + + +def get_table_dimensions(table_soup): + # Get rows for the table + rows = get_table_rows(table_soup) + # Table is either empty or has non-direct children between table and tr tags + # Thus the row dimensions and column dimensions are assumed to be 0 + + cols = get_table_columns(rows[0]) if rows else [] + # Add colspan calculation column number + col_count = 0 + for col in cols: + colspan = col.attrs.get('colspan', 1) + col_count += int(colspan) + + return rows, col_count + + +def get_cell_html(soup): + # Returns string of td element with opening and closing tags removed + # Cannot use find_all as it only finds element tags and does not find text which + # is not inside an element + return ' '.join([str(i) for i in soup.contents]) + + +def delete_paragraph(paragraph): + # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 + p = paragraph._element + p.getparent().remove(p) + p._p = p._element = None + + +def remove_whitespace(string, leading=False, trailing=False): + """Remove white space from a string. + Args: + string(str): The string to remove white space from. + leading(bool, optional): Remove leading new lines when True. + trailing(bool, optional): Remove trailing new lines when False. + Returns: + str: The input string with new line characters removed and white space squashed. + Examples: + Single or multiple new line characters are replaced with space. + >>> remove_whitespace("abc\\ndef") + 'abc def' + >>> remove_whitespace("abc\\n\\n\\ndef") + 'abc def' + New line characters surrounded by white space are replaced with a single space. + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + >>> remove_whitespace("abc \\n \\n \\n def") + 'abc def' + Leading and trailing new lines are replaced with a single space. + >>> remove_whitespace("\\nabc") + ' abc' + >>> remove_whitespace(" \\n abc") + ' abc' + >>> remove_whitespace("abc\\n") + 'abc ' + >>> remove_whitespace("abc \\n ") + 'abc ' + Use ``leading=True`` to remove leading new line characters, including any surrounding + white space: + >>> remove_whitespace("\\nabc", leading=True) + 'abc' + >>> remove_whitespace(" \\n abc", leading=True) + 'abc' + Use ``trailing=True`` to remove trailing new line characters, including any surrounding + white space: + >>> remove_whitespace("abc \\n ", trailing=True) + 'abc' + """ + # Remove any leading new line characters along with any surrounding white space + if leading: + string = re.sub(r'^\s*\n+\s*', '', string) + + # Remove any trailing new line characters along with any surrounding white space + if trailing: + string = re.sub(r'\s*\n+\s*$', '', string) + + # Replace new line characters and absorb any surrounding space. + string = re.sub(r'\s*\n\s*', ' ', string) + # TODO need some way to get rid of extra spaces in e.g. 
text text + return re.sub(r'\s+', ' ', string) + + +font_styles = { + 'b': 'bold', + 'strong': 'bold', + 'em': 'italic', + 'i': 'italic', + 'u': 'underline', + 's': 'strike', + 'sup': 'superscript', + 'sub': 'subscript', + 'th': 'bold', +} + +font_names = { + 'code': 'Courier', + 'pre': 'Courier', +} + + +class HtmlToDocx(HTMLParser): + def __init__(self): + super().__init__() + self.options = { + 'fix-html': True, + 'images': True, + 'tables': True, + 'styles': True, + } + self.table_row_selectors = [ + 'table > tr', 'table > thead > tr', 'table > tbody > tr', + 'table > tfoot > tr' + ] + self.table_style = None + self.paragraph_style = None + + def set_initial_attrs(self, document=None): + self.tags = { + 'span': [], + 'list': [], + } + if document: + self.doc = document + else: + self.doc = Document() + self.bs = self.options[ + 'fix-html'] # whether or not to clean with BeautifulSoup + self.document = self.doc + self.include_tables = True #TODO add this option back in? + self.include_images = self.options['images'] + self.include_styles = self.options['styles'] + self.paragraph = None + self.skip = False + self.skip_tag = None + self.instances_to_skip = 0 + + def copy_settings_from(self, other): + """Copy settings from another instance of HtmlToDocx""" + self.table_style = other.table_style + self.paragraph_style = other.paragraph_style + + def ignore_nested_tables(self, tables_soup): + """ + Returns array containing only the highest level tables + Operates on the assumption that bs4 returns child elements immediately after + the parent element in `find_all`. If this changes in the future, this method will need to be updated + :return: + """ + new_tables = [] + nest = 0 + for table in tables_soup: + if nest: + nest -= 1 + continue + new_tables.append(table) + nest = len(table.find_all('table')) + return new_tables + + def get_tables(self): + if not hasattr(self, 'soup'): + self.include_tables = False + return + # find other way to do it, or require this dependency? 
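+        # Keep only top-level tables here; nested tables are rebuilt later by the child
+        # HtmlToDocx parsers created in handle_table.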
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.table_no = 0 + + def run_process(self, html): + if self.bs and BeautifulSoup: + self.soup = BeautifulSoup(html, 'html.parser') + html = str(self.soup) + if self.include_tables: + self.get_tables() + self.feed(html) + + def add_html_to_cell(self, html, cell): + if not isinstance(cell, docx.table._Cell): + raise ValueError('Second argument needs to be a %s' % + docx.table._Cell) + unwanted_paragraph = cell.paragraphs[0] + if unwanted_paragraph.text == "": + delete_paragraph(unwanted_paragraph) + self.set_initial_attrs(cell) + self.run_process(html) + # cells must end with a paragraph or will get message about corrupt file + # https://stackoverflow.com/a/29287121 + if not self.doc.paragraphs: + self.doc.add_paragraph('') + + def apply_paragraph_style(self, style=None): + try: + if style: + self.paragraph.style = style + elif self.paragraph_style: + self.paragraph.style = self.paragraph_style + except KeyError as e: + raise ValueError( + f"Unable to apply style {self.paragraph_style}.") from e + + def handle_table(self, html, doc): + """ + To handle nested tables, we will parse tables manually as follows: + Get table soup + Create docx table + Iterate over soup and fill docx table with new instances of this parser + Tell HTMLParser to ignore any tags until the corresponding closing table tag + """ + table_soup = BeautifulSoup(html, 'html.parser') + rows, cols_len = get_table_dimensions(table_soup) + table = doc.add_table(len(rows), cols_len) + table.style = doc.styles['Table Grid'] + + cell_row = 0 + for index, row in enumerate(rows): + cols = get_table_columns(row) + cell_col = 0 + for col in cols: + colspan = int(col.attrs.get('colspan', 1)) + rowspan = int(col.attrs.get('rowspan', 1)) + + cell_html = get_cell_html(col) + if col.name == 'th': + cell_html = "%s" % cell_html + + docx_cell = table.cell(cell_row, cell_col) + + while docx_cell.text != '': # Skip the merged cell + cell_col += 1 + docx_cell = table.cell(cell_row, cell_col) + + cell_to_merge = table.cell(cell_row + rowspan - 1, + cell_col + colspan - 1) + if docx_cell != cell_to_merge: + docx_cell.merge(cell_to_merge) + + child_parser = HtmlToDocx() + child_parser.copy_settings_from(self) + child_parser.add_html_to_cell(cell_html or ' ', docx_cell) + + cell_col += colspan + cell_row += 1 + + def handle_data(self, data): + if self.skip: + return + + # Only remove white space if we're not in a pre block. 
+ if 'pre' not in self.tags: + # remove leading and trailing whitespace in all instances + data = remove_whitespace(data, True, True) + + if not self.paragraph: + self.paragraph = self.doc.add_paragraph() + self.apply_paragraph_style() + + # There can only be one nested link in a valid html document + # You cannot have interactive content in an A tag, this includes links + # https://html.spec.whatwg.org/#interactive-content + link = self.tags.get('a') + if link: + self.handle_link(link['href'], data) + else: + # If there's a link, dont put the data directly in the run + self.run = self.paragraph.add_run(data) + spans = self.tags['span'] + for span in spans: + if 'style' in span: + style = self.parse_dict_string(span['style']) + self.add_styles_to_run(style) + + # add font style and name + for tag in self.tags: + if tag in font_styles: + font_style = font_styles[tag] + setattr(self.run.font, font_style, True) + + if tag in font_names: + font_name = font_names[tag] + self.run.font.name = font_name diff --git a/ppstructure/table/README.md b/ppstructure/table/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17f0488791512f56b766951b67fff4e480dbfdda --- /dev/null +++ b/ppstructure/table/README.md @@ -0,0 +1,159 @@ +English | [简体中文](README_ch.md) + +# Table Recognition + +- [1. pipeline](#1-pipeline) +- [2. Performance](#2-performance) +- [3. Result](#3-result) +- [4. How to use](#4-how-to-use) + - [4.1 Quick start](#41-quick-start) + - [4.2 Training, Evaluation and Inference](#42-training-evaluation-and-inference) + - [4.3 Calculate TEDS](#43-calculate-teds) +- [5. Reference](#5-reference) + + +## 1. pipeline +The table recognition mainly contains three models +1. Single line text detection-DB +2. Single line text recognition-CRNN +3. Table structure and cell coordinate prediction-SLANet + +The table recognition flow chart is as follows + +![tableocr_pipeline](../docs/table/tableocr_pipeline_en.jpg) + +1. The coordinates of single-line text is detected by DB model, and then sends it to the recognition model to get the recognition result. +2. The table structure and cell coordinates is predicted by SLANet model. +3. The recognition result of the cell is combined by the coordinates, recognition result of the single line and the coordinates of the cell. +4. The cell recognition result and the table structure together construct the html string of the table. + +## 2. Performance +We evaluated the algorithm on the PubTabNet[1] eval dataset, and the performance is as follows: + +|Method|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.30% |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) | 76.31%| 95.89%|766ms| + +The performance indicators are explained as follows: +- Acc: The accuracy of the table structure in each image, a wrong token is considered an error. +- TEDS: The accuracy of the model's restoration of table information. This indicator evaluates not only the table structure, but also the text content in the table. +- Speed: The inference speed of a single image when the model runs on the CPU machine and MKL is enabled. + +## 3. Result + +![](../docs/imgs/table_ch_result1.jpg) +![](../docs/imgs/table_ch_result2.jpg) +![](../docs/imgs/table_ch_result3.jpg) + +## 4. How to use + +### 4.1 Quick start + +PP-Structure currently provides table recognition models in both Chinese and English. 
For the model link, see [models_list](../docs/models_list.md). The whl package is also provided for quick use, see [quickstart](../docs/quickstart_en.md) for details. + +The following takes the Chinese table recognition model as an example to introduce how to recognize a table. + +Use the following commands to quickly complete the identification of a table. + +```python +cd PaddleOCR/ppstructure + +# download model +mkdir inference && cd inference +# Download the PP-OCRv3 text detection model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# Download the PP-OCRv3 text recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# Download the PP-StructureV2 form recognition model and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. +# run +python3.7 table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table + +``` + +After the operation is completed, the excel table of each image will be saved to the directory specified by the output field, and an html file will be produced in the directory to visually view the cell coordinates and the recognized table. + +**NOTE** +1. If you want to use the English table recognition model, you need to download the English text detection and recognition model and the English table recognition model in [models_list](../docs/models_list_en.md), and replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`. +2. To use the TableRec-RARE model, you need to replace `table_structure_dict_ch.txt` with `table_structure_dict.txt`, and add parameter `--merge_no_span_structure=False` + +### 4.2 Training, Evaluation and Inference + +The training, evaluation and inference process of the text detection model can be referred to [detection](../../doc/doc_en/detection_en.md) + +The training, evaluation and inference process of the text recognition model can be referred to [recognition](../../doc/doc_en/recognition_en.md) + +The training, evaluation and inference process of the table recognition model can be referred to [table_recognition](../../doc/doc_en/table_recognition_en.md) + +### 4.3 Calculate TEDS + +The table uses [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) as the evaluation metric of the model. Before the model evaluation, the three models in the pipeline need to be exported as inference models (we have provided them), and the gt for evaluation needs to be prepared. Examples of gt are as follows: +```txt +PMC5755158_010_01.png
<html><body><table><thead><tr><td></td><td><b>Weaning</b></td><td><b>Week 15</b></td><td><b>Off-test</b></td></tr></thead><tbody><tr><td><b>Weaning</b></td><td></td><td></td><td></td></tr><tr><td><b>Week 15</b></td><td></td><td>0.17 ± 0.08</td><td>0.16 ± 0.03</td></tr><tr><td><b>Off-test</b></td><td></td><td>0.80 ± 0.24</td><td>0.19 ± 0.09</td></tr></tbody></table></body></html>
+``` +Each line in gt consists of the file name and the html string of the table. The file name and the html string of the table are separated by `\t`. + +You can also use the following command to generate an evaluation gt file from the annotation file: +```python +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file +``` + +Use the following command to evaluate. After the evaluation is completed, the teds indicator will be output. +```python +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` + +Evaluate on the PubLatNet dataset using the English model + +```bash +cd PaddleOCR/ppstructure +# Download the model +mkdir inference && cd inference +# Download the text detection model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# Download the text recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# Download the table recognition model trained on the PubTabNet dataset and unzip it +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. + +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --rec_image_shape=3,32,320 \ + --gt_path=path/to/gt.txt +``` + +output is +```bash +teds: 95.89 +``` + +## 5. Reference +1. https://github.com/ibm-aur-nlp/PubTabNet +2. https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/table/README_ch.md b/ppstructure/table/README_ch.md new file mode 100644 index 0000000000000000000000000000000000000000..b8817523c67821e49fc258d1e71c8eae3f48435a --- /dev/null +++ b/ppstructure/table/README_ch.md @@ -0,0 +1,163 @@ +[English](README.md) | 简体中文 + +# 表格识别 + +- [1. 表格识别 pipeline](#1-表格识别-pipeline) +- [2. 性能](#2-性能) +- [3. 效果演示](#3-效果演示) +- [4. 使用](#4-使用) + - [4.1 快速开始](#41-快速开始) + - [4.2 模型训练、评估与推理](#42-模型训练评估与推理) + - [4.3 计算TEDS](#43-计算teds) +- [5. Reference](#5-reference) + + +## 1. 表格识别 pipeline + +表格识别主要包含三个模型 +1. 单行文本检测-DB +2. 单行文本识别-CRNN +3. 表格结构和cell坐标预测-SLANet + +具体流程图如下 + +![tableocr_pipeline](../docs/table/tableocr_pipeline.jpg) + +流程说明: + +1. 图片由单行文字检测模型检测到单行文字的坐标,然后送入识别模型拿到识别结果。 +2. 图片由SLANet模型拿到表格的结构信息和单元格的坐标信息。 +3. 由单行文字的坐标、识别结果和单元格的坐标一起组合出单元格的识别结果。 +4. 单元格的识别结果和表格结构一起构造表格的html字符串。 + + +## 2. 
性能 + +我们在 PubTabNet[1] 评估数据集上对算法进行了评估,性能如下 + + +|算法|Acc|[TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src)|Speed| +| --- | --- | --- | ---| +| EDD[2] |x| 88.30% |x| +| TableRec-RARE(ours) | 71.73%| 93.88% |779ms| +| SLANet(ours) |76.31%| 95.89%|766ms| + +性能指标解释如下: +- Acc: 模型对每张图像里表格结构的识别准确率,错一个token就算错误。 +- TEDS: 模型对表格信息还原的准确度,此指标评价内容不仅包含表格结构,还包含表格内的文字内容。 +- Speed: 模型在CPU机器上,开启MKL的情况下,单张图片的推理速度。 + +## 3. 效果演示 + +![](../docs/imgs/table_ch_result1.jpg) +![](../docs/imgs/table_ch_result2.jpg) +![](../docs/imgs/table_ch_result3.jpg) + +## 4. 使用 + +### 4.1 快速开始 + +PP-Structure目前提供了中英文两种语言的表格识别模型,模型链接见 [models_list](../docs/models_list.md)。也提供了whl包的形式方便快速使用,详见 [quickstart](../docs/quickstart.md)。 + +下面以中文表格识别模型为例,介绍如何识别一张表格。 + +使用如下命令即可快速完成一张表格的识别。 +```python +cd PaddleOCR/ppstructure + +# 下载模型 +mkdir inference && cd inference +# 下载PP-OCRv3文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar && tar xf ch_PP-OCRv3_det_infer.tar +# 下载PP-OCRv3文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar && tar xf ch_PP-OCRv3_rec_infer.tar +# 下载PP-StructureV2中文表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf ch_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. +# 执行表格识别 +python table/predict_table.py \ + --det_model_dir=inference/ch_PP-OCRv3_det_infer \ + --rec_model_dir=inference/ch_PP-OCRv3_rec_infer \ + --table_model_dir=inference/ch_ppstructure_mobile_v2.0_SLANet_infer \ + --rec_char_dict_path=../ppocr/utils/ppocr_keys_v1.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict_ch.txt \ + --image_dir=docs/table/table.jpg \ + --output=../output/table +``` +运行完成后,每张图片的excel表格会保存到output字段指定的目录下,同时在该目录下回生产一个html文件,用于可视化查看单元格坐标和识别的表格。 + +**NOTE** +1. 如果想使用英文模型,需要在 [models_list](../docs/models_list.md) 中下载英文文字检测识别模型和英文表格识别模型,同时替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`即可。 +2. 如需使用TableRec-RARE模型,需要替换`table_structure_dict_ch.txt`为`table_structure_dict.txt`,同时参数`--merge_no_span_structure=False` + +### 4.2 模型训练、评估与推理 + +文本检测模型的训练、评估和推理流程可参考 [detection](../../doc/doc_ch/detection.md) + +文本识别模型的训练、评估和推理流程可参考 [recognition](../../doc/doc_ch/recognition.md) + +表格识别模型的训练、评估和推理流程可参考 [table_recognition](../../doc/doc_ch/table_recognition.md) + +### 4.3 计算TEDS + +表格使用 [TEDS(Tree-Edit-Distance-based Similarity)](https://github.com/ibm-aur-nlp/PubTabNet/tree/master/src) 作为模型的评估指标。在进行模型评估之前,需要将pipeline中的三个模型分别导出为inference模型(我们已经提供好),还需要准备评估的gt, gt示例如下: +```txt +PMC5755158_010_01.png
<html><body><table><thead><tr><td></td><td><b>Weaning</b></td><td><b>Week 15</b></td><td><b>Off-test</b></td></tr></thead><tbody><tr><td><b>Weaning</b></td><td></td><td></td><td></td></tr><tr><td><b>Week 15</b></td><td></td><td>0.17 ± 0.08</td><td>0.16 ± 0.03</td></tr><tr><td><b>Off-test</b></td><td></td><td>0.80 ± 0.24</td><td>0.19 ± 0.09</td></tr></tbody></table></body></html>
+``` +gt每一行都由文件名和表格的html字符串组成,文件名和表格的html字符串之间使用`\t`分隔。 + +也可使用如下命令,由标注文件生成评估的gt文件: +```python +python3 ppstructure/table/convert_label2html.py --ori_gt_path /path/to/your_label_file --save_path /path/to/save_file +``` + +准备完成后使用如下命令进行评估,评估完成后会输出teds指标。 +```python +cd PaddleOCR/ppstructure +python3 table/eval_table.py \ + --det_model_dir=path/to/det_model_dir \ + --rec_model_dir=path/to/rec_model_dir \ + --table_model_dir=path/to/table_model_dir \ + --image_dir=docs/table/table.jpg \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --gt_path=path/to/gt.txt +``` + +如使用英文表格识别模型在PubLatNet数据集上进行评估 + +```bash +cd PaddleOCR/ppstructure +# 下载模型 +mkdir inference && cd inference +# 下载基于PubTabNet数据集训练的文本检测模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf en_ppocr_mobile_v2.0_table_det_infer.tar +# 下载基于PubTabNet数据集训练的文本识别模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar +# 下载基于PubTabNet数据集训练的表格识别模型并解压 +wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar +cd .. + +python3 table/eval_table.py \ + --det_model_dir=inference/en_ppocr_mobile_v2.0_table_det_infer \ + --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer \ + --table_model_dir=inference/en_ppstructure_mobile_v2.0_SLANet_infer \ + --image_dir=train_data/table/pubtabnet/val/ \ + --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt \ + --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt \ + --det_limit_side_len=736 \ + --det_limit_type=min \ + --rec_image_shape=3,32,320 \ + --gt_path=path/to/gt.txt +``` + +将会输出 +```bash +teds: 95.89 +``` + +## 5. Reference +1. https://github.com/ibm-aur-nlp/PubTabNet +2. https://arxiv.org/pdf/1911.10683 diff --git a/ppstructure/table/__init__.py b/ppstructure/table/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d11e265597c7c8e39098a228108da3bb954b892 --- /dev/null +++ b/ppstructure/table/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
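作为上文4.3节TEDS评估流程的补充,下面给出一个最小示意,展示如何对单条样本直接计算TEDS(假设在PaddleOCR仓库根目录下运行,gt_html与pred_html为假设的示例字符串):

```python
from ppstructure.table.table_metric import TEDS

# 假设的gt与预测HTML,仅用于演示指标的调用方式
gt_html = '<html><body><table><tr><td>A</td><td>B</td></tr></table></body></html>'
pred_html = '<html><body><table><tr><td>A</td><td>B</td></tr></table></body></html>'

teds = TEDS(n_jobs=1)
scores = teds.batch_evaluate_html([gt_html], [pred_html])
print('teds: {:.4f}'.format(sum(scores) / len(scores)))  # 结构与内容完全一致时接近1.0
```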
diff --git a/ppstructure/table/__pycache__/__init__.cpython-37.pyc b/ppstructure/table/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9e64bd5c788b3ad71e176d81977453b28241eab Binary files /dev/null and b/ppstructure/table/__pycache__/__init__.cpython-37.pyc differ diff --git a/ppstructure/table/__pycache__/__init__.cpython-38.pyc b/ppstructure/table/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6211f5dd5c06da683a44c3fcd365d5d6244c74c1 Binary files /dev/null and b/ppstructure/table/__pycache__/__init__.cpython-38.pyc differ diff --git a/ppstructure/table/__pycache__/matcher.cpython-37.pyc b/ppstructure/table/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acce5211597f714e0a4a90b108a2ba8b82a03225 Binary files /dev/null and b/ppstructure/table/__pycache__/matcher.cpython-37.pyc differ diff --git a/ppstructure/table/__pycache__/matcher.cpython-38.pyc b/ppstructure/table/__pycache__/matcher.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0acc58a6fe5d68c0333e31129ba62de87f4a6a20 Binary files /dev/null and b/ppstructure/table/__pycache__/matcher.cpython-38.pyc differ diff --git a/ppstructure/table/__pycache__/predict_structure.cpython-37.pyc b/ppstructure/table/__pycache__/predict_structure.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..446d00c3e39e132794a6376bb3feacd457c01099 Binary files /dev/null and b/ppstructure/table/__pycache__/predict_structure.cpython-37.pyc differ diff --git a/ppstructure/table/__pycache__/predict_structure.cpython-38.pyc b/ppstructure/table/__pycache__/predict_structure.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84436bf8e2ce1a5b2305df6f6856e3a011960414 Binary files /dev/null and b/ppstructure/table/__pycache__/predict_structure.cpython-38.pyc differ diff --git a/ppstructure/table/__pycache__/predict_table.cpython-37.pyc b/ppstructure/table/__pycache__/predict_table.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bed7c6f65eba0fb5096a24d570935ab38037345b Binary files /dev/null and b/ppstructure/table/__pycache__/predict_table.cpython-37.pyc differ diff --git a/ppstructure/table/__pycache__/predict_table.cpython-38.pyc b/ppstructure/table/__pycache__/predict_table.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cfd421b382afba822653f877dd41344dac87ea9 Binary files /dev/null and b/ppstructure/table/__pycache__/predict_table.cpython-38.pyc differ diff --git a/ppstructure/table/__pycache__/table_master_match.cpython-37.pyc b/ppstructure/table/__pycache__/table_master_match.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2c78c9ebfd4cbf673f127047e010dafb215f29b Binary files /dev/null and b/ppstructure/table/__pycache__/table_master_match.cpython-37.pyc differ diff --git a/ppstructure/table/__pycache__/table_master_match.cpython-38.pyc b/ppstructure/table/__pycache__/table_master_match.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efd9307d1890046cf70d38468fcd7f6248ac56e7 Binary files /dev/null and b/ppstructure/table/__pycache__/table_master_match.cpython-38.pyc differ diff --git a/ppstructure/table/convert_label2html.py b/ppstructure/table/convert_label2html.py new file mode 100644 index 
0000000000000000000000000000000000000000..be16212ac420326a91cf8ab281a77e5990530c0e --- /dev/null +++ b/ppstructure/table/convert_label2html.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +conver table label to html +""" + +import json +import argparse +from tqdm import tqdm + + +def save_pred_txt(key, val, tmp_file_path): + with open(tmp_file_path, 'a+', encoding='utf-8') as f: + f.write('{}\t{}\n'.format(key, val)) + + +def skip_char(text, sp_char_list): + """ + skip empty cell + @param text: text in cell + @param sp_char_list: style char and special code + @return: + """ + for sp_char in sp_char_list: + text = text.replace(sp_char, '') + return text + + +def gen_html(img): + ''' + Formats HTML code from tokenized annotation of img + ''' + html_code = img['html']['structure']['tokens'].copy() + to_insert = [i for i, tag in enumerate(html_code) if tag in ('', '>')] + for i, cell in zip(to_insert[::-1], img['html']['cells'][::-1]): + if cell['tokens']: + text = ''.join(cell['tokens']) + # skip empty text + sp_char_list = ['', '', '\u2028', ' ', '', ''] + text_remove_style = skip_char(text, sp_char_list) + if len(text_remove_style) == 0: + continue + html_code.insert(i + 1, text) + html_code = ''.join(html_code) + html_code = '{}
'.format(html_code) + return html_code + + +def load_gt_data(gt_path): + """ + load gt + @param gt_path: + @return: + """ + data_list = {} + with open(gt_path, 'rb') as f: + lines = f.readlines() + for line in tqdm(lines): + data_line = line.decode('utf-8').strip("\n") + info = json.loads(data_line) + data_list[info['filename']] = info + return data_list + + +def convert(origin_gt_path, save_path): + """ + gen html from label file + @param origin_gt_path: + @param save_path: + @return: + """ + data_dict = load_gt_data(origin_gt_path) + for img_name, gt in tqdm(data_dict.items()): + html = gen_html(gt) + save_pred_txt(img_name, html, save_path) + print('conver finish') + + +def parse_args(): + parser = argparse.ArgumentParser(description="args for paddleserving") + parser.add_argument( + "--ori_gt_path", type=str, required=True, help="label gt path") + parser.add_argument( + "--save_path", type=str, required=True, help="path to save file") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + convert(args.ori_gt_path, args.save_path) diff --git a/ppstructure/table/eval_table.py b/ppstructure/table/eval_table.py new file mode 100644 index 0000000000000000000000000000000000000000..4fc16b5d4c6a0143dcea149508bd6b62730092b6 --- /dev/null +++ b/ppstructure/table/eval_table.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
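+# Evaluate the end-to-end table recognition pipeline with the TEDS metric; intermediate
+# OCR and structure results are cached as pickle files under args.output so repeated runs
+# can reuse them.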
+ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +import cv2 +import pickle +import paddle +from tqdm import tqdm +from ppstructure.table.table_metric import TEDS +from ppstructure.table.predict_table import TableSystem +from ppstructure.utility import init_args +from ppocr.utils.logging import get_logger + +logger = get_logger() + + +def parse_args(): + parser = init_args() + parser.add_argument("--gt_path", type=str) + return parser.parse_args() + + +def load_txt(txt_path): + pred_html_dict = {} + if not os.path.exists(txt_path): + return pred_html_dict + with open(txt_path, encoding='utf-8') as f: + lines = f.readlines() + for line in lines: + line = line.strip().split('\t') + img_name, pred_html = line + pred_html_dict[img_name] = pred_html + return pred_html_dict + + +def load_result(path): + data = {} + if os.path.exists(path): + data = pickle.load(open(path, 'rb')) + return data + + +def save_result(path, data): + old_data = load_result(path) + old_data.update(data) + with open(path, 'wb') as f: + pickle.dump(old_data, f) + + +def main(gt_path, img_root, args): + os.makedirs(args.output, exist_ok=True) + # init TableSystem + text_sys = TableSystem(args) + # load gt and preds html result + gt_html_dict = load_txt(gt_path) + + ocr_result = load_result(os.path.join(args.output, 'ocr.pickle')) + structure_result = load_result( + os.path.join(args.output, 'structure.pickle')) + + pred_htmls = [] + gt_htmls = [] + for img_name, gt_html in tqdm(gt_html_dict.items()): + img = cv2.imread(os.path.join(img_root, img_name)) + # run ocr and save result + if img_name not in ocr_result: + dt_boxes, rec_res, _, _ = text_sys._ocr(img) + ocr_result[img_name] = [dt_boxes, rec_res] + save_result(os.path.join(args.output, 'ocr.pickle'), ocr_result) + # run structure and save result + if img_name not in structure_result: + structure_res, _ = text_sys._structure(img) + structure_result[img_name] = structure_res + save_result( + os.path.join(args.output, 'structure.pickle'), structure_result) + dt_boxes, rec_res = ocr_result[img_name] + structure_res = structure_result[img_name] + # match ocr and structure + pred_html = text_sys.match(structure_res, dt_boxes, rec_res) + + pred_htmls.append(pred_html) + gt_htmls.append(gt_html) + + # compute teds + teds = TEDS(n_jobs=16) + scores = teds.batch_evaluate_html(gt_htmls, pred_htmls) + logger.info('teds: {}'.format(sum(scores) / len(scores))) + + +if __name__ == '__main__': + args = parse_args() + main(args.gt_path, args.image_dir, args) diff --git a/ppstructure/table/matcher.py b/ppstructure/table/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..9c5bd2630f78527ade4fd1309f22d1731fe838a2 --- /dev/null +++ b/ppstructure/table/matcher.py @@ -0,0 +1,192 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
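+# Match OCR detection boxes to the table cells predicted by the structure model: each text
+# box is assigned to the cell with the largest IoU (ties broken by smallest L1 distance),
+# and the matched texts are merged back into the predicted HTML structure.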
+ +import numpy as np +from ppstructure.table.table_master_match import deal_eb_token, deal_bb + + +def distance(box_1, box_2): + x1, y1, x2, y2 = box_1 + x3, y3, x4, y4 = box_2 + dis = abs(x3 - x1) + abs(y3 - y1) + abs(x4 - x2) + abs(y4 - y2) + dis_2 = abs(x3 - x1) + abs(y3 - y1) + dis_3 = abs(x4 - x2) + abs(y4 - y2) + return dis + min(dis_2, dis_3) + + +def compute_iou(rec1, rec2): + """ + computing IoU + :param rec1: (y0, x0, y1, x1), which reflects + (top, left, bottom, right) + :param rec2: (y0, x0, y1, x1) + :return: scala value of IoU + """ + # computing area of each rectangles + S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) + S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) + + # computing the sum_area + sum_area = S_rec1 + S_rec2 + + # find the each edge of intersect rectangle + left_line = max(rec1[1], rec2[1]) + right_line = min(rec1[3], rec2[3]) + top_line = max(rec1[0], rec2[0]) + bottom_line = min(rec1[2], rec2[2]) + + # judge if there is an intersect + if left_line >= right_line or top_line >= bottom_line: + return 0.0 + else: + intersect = (right_line - left_line) * (bottom_line - top_line) + return (intersect / (sum_area - intersect)) * 1.0 + + +class TableMatch: + def __init__(self, filter_ocr_result=False, use_master=False): + self.filter_ocr_result = filter_ocr_result + self.use_master = use_master + + def __call__(self, structure_res, dt_boxes, rec_res): + pred_structures, pred_bboxes = structure_res + if self.filter_ocr_result: + dt_boxes, rec_res = self._filter_ocr_result(pred_bboxes, dt_boxes, + rec_res) + matched_index = self.match_result(dt_boxes, pred_bboxes) + if self.use_master: + pred_html, pred = self.get_pred_html_master(pred_structures, + matched_index, rec_res) + else: + pred_html, pred = self.get_pred_html(pred_structures, matched_index, + rec_res) + return pred_html + + def match_result(self, dt_boxes, pred_bboxes): + matched = {} + for i, gt_box in enumerate(dt_boxes): + distances = [] + for j, pred_box in enumerate(pred_bboxes): + if len(pred_box) == 8: + pred_box = [ + np.min(pred_box[0::2]), np.min(pred_box[1::2]), + np.max(pred_box[0::2]), np.max(pred_box[1::2]) + ] + distances.append((distance(gt_box, pred_box), + 1. 
- compute_iou(gt_box, pred_box) + )) # compute iou and l1 distance + sorted_distances = distances.copy() + # select det box by iou and l1 distance + sorted_distances = sorted( + sorted_distances, key=lambda item: (item[1], item[0])) + if distances.index(sorted_distances[0]) not in matched.keys(): + matched[distances.index(sorted_distances[0])] = [i] + else: + matched[distances.index(sorted_distances[0])].append(i) + return matched + + def get_pred_html(self, pred_structures, matched_index, ocr_contents): + end_html = [] + td_index = 0 + for tag in pred_structures: + if '' in tag: + if '' == tag: + end_html.extend('') + if td_index in matched_index.keys(): + b_with = False + if '' in ocr_contents[matched_index[td_index][ + 0]] and len(matched_index[td_index]) > 1: + b_with = True + end_html.extend('') + for i, td_index_index in enumerate(matched_index[td_index]): + content = ocr_contents[td_index_index][0] + if len(matched_index[td_index]) > 1: + if len(content) == 0: + continue + if content[0] == ' ': + content = content[1:] + if '' in content: + content = content[3:] + if '' in content: + content = content[:-4] + if len(content) == 0: + continue + if i != len(matched_index[ + td_index]) - 1 and ' ' != content[-1]: + content += ' ' + end_html.extend(content) + if b_with: + end_html.extend('') + if '' == tag: + end_html.append('') + else: + end_html.append(tag) + td_index += 1 + else: + end_html.append(tag) + return ''.join(end_html), end_html + + def get_pred_html_master(self, pred_structures, matched_index, + ocr_contents): + end_html = [] + td_index = 0 + for token in pred_structures: + if '' in token: + txt = '' + b_with = False + if td_index in matched_index.keys(): + if '' in ocr_contents[matched_index[td_index][ + 0]] and len(matched_index[td_index]) > 1: + b_with = True + for i, td_index_index in enumerate(matched_index[td_index]): + content = ocr_contents[td_index_index][0] + if len(matched_index[td_index]) > 1: + if len(content) == 0: + continue + if content[0] == ' ': + content = content[1:] + if '' in content: + content = content[3:] + if '' in content: + content = content[:-4] + if len(content) == 0: + continue + if i != len(matched_index[ + td_index]) - 1 and ' ' != content[-1]: + content += ' ' + txt += content + if b_with: + txt = '{}'.format(txt) + if '' == token: + token = '{}'.format(txt) + else: + token = '{}'.format(txt) + td_index += 1 + token = deal_eb_token(token) + end_html.append(token) + html = ''.join(end_html) + html = deal_bb(html) + return html, end_html + + def _filter_ocr_result(self, pred_bboxes, dt_boxes, rec_res): + y1 = pred_bboxes[:, 1::2].min() + new_dt_boxes = [] + new_rec_res = [] + + for box, rec in zip(dt_boxes, rec_res): + if np.max(box[1::2]) < y1: + continue + new_dt_boxes.append(box) + new_rec_res.append(rec) + return new_dt_boxes, new_rec_res diff --git a/ppstructure/table/predict_structure.py b/ppstructure/table/predict_structure.py new file mode 100644 index 0000000000000000000000000000000000000000..08e381a846f1e8b4d38918e1031f5b219fed54e2 --- /dev/null +++ b/ppstructure/table/predict_structure.py @@ -0,0 +1,202 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time +import json + +import tools.infer.utility as utility +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.utils.visual import draw_rectangle +from ppstructure.utility import parse_args + +logger = get_logger() + + +def build_pre_process_list(args): + resize_op = {'ResizeTableImage': {'max_len': args.table_max_len, }} + pad_op = { + 'PaddingTableImage': { + 'size': [args.table_max_len, args.table_max_len] + } + } + normalize_op = { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225] if + args.table_algorithm not in ['TableMaster'] else [0.5, 0.5, 0.5], + 'mean': [0.485, 0.456, 0.406] if + args.table_algorithm not in ['TableMaster'] else [0.5, 0.5, 0.5], + 'scale': '1./255.', + 'order': 'hwc' + } + } + to_chw_op = {'ToCHWImage': None} + keep_keys_op = {'KeepKeys': {'keep_keys': ['image', 'shape']}} + if args.table_algorithm not in ['TableMaster']: + pre_process_list = [ + resize_op, normalize_op, pad_op, to_chw_op, keep_keys_op + ] + else: + pre_process_list = [ + resize_op, pad_op, normalize_op, to_chw_op, keep_keys_op + ] + return pre_process_list + + +class TableStructurer(object): + def __init__(self, args): + self.args = args + self.use_onnx = args.use_onnx + pre_process_list = build_pre_process_list(args) + if args.table_algorithm not in ['TableMaster']: + postprocess_params = { + 'name': 'TableLabelDecode', + "character_dict_path": args.table_char_dict_path, + 'merge_no_span_structure': args.merge_no_span_structure + } + else: + postprocess_params = { + 'name': 'TableMasterLabelDecode', + "character_dict_path": args.table_char_dict_path, + 'box_shape': 'pad', + 'merge_no_span_structure': args.merge_no_span_structure + } + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'table', logger) + + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="table", + model_precision=args.precision, + batch_size=1, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + + def __call__(self, img): + starttime = time.time() + if self.args.benchmark: + self.autolog.times.start() + + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + img = data[0] + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + img = 
img.copy() + if self.args.benchmark: + self.autolog.times.stamp() + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.args.benchmark: + self.autolog.times.stamp() + + preds = {} + preds['structure_probs'] = outputs[1] + preds['loc_preds'] = outputs[0] + + shape_list = np.expand_dims(data[-1], axis=0) + post_result = self.postprocess_op(preds, [shape_list]) + + structure_str_list = post_result['structure_batch_list'][0] + bbox_list = post_result['bbox_batch_list'][0] + structure_str_list = structure_str_list[0] + structure_str_list = [ + '', '', '' + ] + structure_str_list + ['
', '', ''] + elapse = time.time() - starttime + if self.args.benchmark: + self.autolog.times.end(stamp=True) + return (structure_str_list, bbox_list), elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + table_structurer = TableStructurer(args) + count = 0 + total_time = 0 + os.makedirs(args.output, exist_ok=True) + with open( + os.path.join(args.output, 'infer.txt'), mode='w', + encoding='utf-8') as f_w: + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + structure_res, elapse = table_structurer(img) + structure_str_list, bbox_list = structure_res + bbox_list_str = json.dumps(bbox_list.tolist()) + logger.info("result: {}, {}".format(structure_str_list, + bbox_list_str)) + f_w.write("result: {}, {}\n".format(structure_str_list, + bbox_list_str)) + + if len(bbox_list) > 0 and len(bbox_list[0]) == 4: + img = draw_rectangle(image_file, bbox_list) + else: + img = utility.draw_boxes(img, bbox_list) + img_save_path = os.path.join(args.output, + os.path.basename(image_file)) + cv2.imwrite(img_save_path, img) + logger.info("save vis result to {}".format(img_save_path)) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + if args.benchmark: + table_structurer.autolog.report() + + +if __name__ == "__main__": + main(parse_args()) diff --git a/ppstructure/table/predict_table.py b/ppstructure/table/predict_table.py new file mode 100644 index 0000000000000000000000000000000000000000..354baf6ddf5e73b2e933a9b9e8a568bda80340e5 --- /dev/null +++ b/ppstructure/table/predict_table.py @@ -0,0 +1,230 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
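+# End-to-end table recognition: run text detection/recognition and table structure
+# prediction, match the two sets of results into an HTML table, then export each image's
+# table to Excel and write an HTML page for visual inspection.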
+ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +import cv2 +import copy +import logging +import numpy as np +import time +import tools.infer.predict_rec as predict_rec +import tools.infer.predict_det as predict_det +import tools.infer.utility as utility +from tools.infer.predict_system import sorted_boxes +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.utils.logging import get_logger +from ppstructure.table.matcher import TableMatch +from ppstructure.table.table_master_match import TableMasterMatcher +from ppstructure.utility import parse_args +import ppstructure.table.predict_structure as predict_strture + +logger = get_logger() + + +def expand(pix, det_box, shape): + x0, y0, x1, y1 = det_box + # print(shape) + h, w, c = shape + tmp_x0 = x0 - pix + tmp_x1 = x1 + pix + tmp_y0 = y0 - pix + tmp_y1 = y1 + pix + x0_ = tmp_x0 if tmp_x0 >= 0 else 0 + x1_ = tmp_x1 if tmp_x1 <= w else w + y0_ = tmp_y0 if tmp_y0 >= 0 else 0 + y1_ = tmp_y1 if tmp_y1 <= h else h + return x0_, y0_, x1_, y1_ + + +class TableSystem(object): + def __init__(self, args, text_detector=None, text_recognizer=None): + self.args = args + if not args.show_log: + logger.setLevel(logging.INFO) + benchmark_tmp = False + if args.benchmark: + benchmark_tmp = args.benchmark + args.benchmark = False + self.text_detector = predict_det.TextDetector(copy.deepcopy( + args)) if text_detector is None else text_detector + self.text_recognizer = predict_rec.TextRecognizer(copy.deepcopy( + args)) if text_recognizer is None else text_recognizer + if benchmark_tmp: + args.benchmark = True + self.table_structurer = predict_strture.TableStructurer(args) + if args.table_algorithm in ['TableMaster']: + self.match = TableMasterMatcher() + else: + self.match = TableMatch(filter_ocr_result=True) + + self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( + args, 'table', logger) + + def __call__(self, img, return_ocr_result_in_table=False): + result = dict() + time_dict = {'det': 0, 'rec': 0, 'table': 0, 'all': 0, 'match': 0} + start = time.time() + structure_res, elapse = self._structure(copy.deepcopy(img)) + result['cell_bbox'] = structure_res[1].tolist() + time_dict['table'] = elapse + + dt_boxes, rec_res, det_elapse, rec_elapse = self._ocr( + copy.deepcopy(img)) + time_dict['det'] = det_elapse + time_dict['rec'] = rec_elapse + + if return_ocr_result_in_table: + result['boxes'] = dt_boxes #[x.tolist() for x in dt_boxes] + result['rec_res'] = rec_res + + tic = time.time() + pred_html = self.match(structure_res, dt_boxes, rec_res) + toc = time.time() + time_dict['match'] = toc - tic + result['html'] = pred_html + end = time.time() + time_dict['all'] = end - start + return result, time_dict + + def _structure(self, img): + structure_res, elapse = self.table_structurer(copy.deepcopy(img)) + return structure_res, elapse + + def _ocr(self, img): + h, w = img.shape[:2] + dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img)) + dt_boxes = sorted_boxes(dt_boxes) + + r_boxes = [] + for box in dt_boxes: + x_min = max(0, box[:, 0].min() - 1) + x_max = min(w, box[:, 0].max() + 1) + y_min = max(0, box[:, 1].min() - 1) + y_max = min(h, box[:, 1].max() + 1) + box = [x_min, y_min, x_max, y_max] + r_boxes.append(box) + dt_boxes = 
np.array(r_boxes) + logger.debug("dt_boxes num : {}, elapse : {}".format( + len(dt_boxes), det_elapse)) + if dt_boxes is None: + return None, None + + img_crop_list = [] + for i in range(len(dt_boxes)): + det_box = dt_boxes[i] + x0, y0, x1, y1 = expand(2, det_box, img.shape) + text_rect = img[int(y0):int(y1), int(x0):int(x1), :] + img_crop_list.append(text_rect) + rec_res, rec_elapse = self.text_recognizer(img_crop_list) + logger.debug("rec_res num : {}, elapse : {}".format( + len(rec_res), rec_elapse)) + return dt_boxes, rec_res, det_elapse, rec_elapse + + +def to_excel(html_table, excel_path): + from tablepyxl import tablepyxl + tablepyxl.document_to_xl(html_table, excel_path) + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + image_file_list = image_file_list[args.process_id::args.total_process_num] + os.makedirs(args.output, exist_ok=True) + + table_sys = TableSystem(args) + img_num = len(image_file_list) + + f_html = open( + os.path.join(args.output, 'show.html'), mode='w', encoding='utf-8') + f_html.write('\n\n') + f_html.write('\n') + f_html.write( + "" + ) + f_html.write("\n") + f_html.write('') + f_html.write('') + f_html.write('') + f_html.write("\n") + + for i, image_file in enumerate(image_file_list): + logger.info("[{}/{}] {}".format(i, img_num, image_file)) + img, flag, _ = check_and_read(image_file) + excel_path = os.path.join( + args.output, os.path.basename(image_file).split('.')[0] + '.xlsx') + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.error("error in loading image:{}".format(image_file)) + continue + starttime = time.time() + pred_res, _ = table_sys(img) + pred_html = pred_res['html'] + logger.info(pred_html) + to_excel(pred_html, excel_path) + logger.info('excel saved to {}'.format(excel_path)) + elapse = time.time() - starttime + logger.info("Predict time : {:.3f}s".format(elapse)) + + if len(pred_res['cell_bbox']) > 0 and len(pred_res['cell_bbox'][ + 0]) == 4: + img = predict_strture.draw_rectangle(image_file, + pred_res['cell_bbox']) + else: + img = utility.draw_boxes(img, pred_res['cell_bbox']) + img_save_path = os.path.join(args.output, os.path.basename(image_file)) + cv2.imwrite(img_save_path, img) + + f_html.write("\n") + f_html.write(f'\n') + f_html.write('
<td>img name</td><td>ori image</td><td>table html</td><td>cell box</td></tr>\n') + f_html.write(f'<td> {os.path.basename(image_file)} <br/>\n') + f_html.write(f'<td><img src="{image_file}" width=640></td>\n') + f_html.write('<td><table border="1">' + pred_html.replace( + '<html><body><table>', '').replace('</table></body></html>', '') + + '</table></td>
\n') + f_html.write( + f'\n') + f_html.write("\n") + f_html.write("\n") + f_html.close() + + if args.benchmark: + table_sys.table_structurer.autolog.report() + + +if __name__ == "__main__": + args = parse_args() + if args.use_mp: + import subprocess + p_list = [] + total_process_num = args.total_process_num + for process_id in range(total_process_num): + cmd = [sys.executable, "-u"] + sys.argv + [ + "--process_id={}".format(process_id), + "--use_mp={}".format(False) + ] + p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) + p_list.append(p) + for p in p_list: + p.wait() + else: + main(args) diff --git a/ppstructure/table/table_master_match.py b/ppstructure/table/table_master_match.py new file mode 100644 index 0000000000000000000000000000000000000000..7a7208d4a94bb357b1bbce0d664d9d6449a96874 --- /dev/null +++ b/ppstructure/table/table_master_match.py @@ -0,0 +1,953 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is refer from: +https://github.com/JiaquanYe/TableMASTER-mmocr/blob/master/table_recognition/match.py +""" + +import os +import re +import cv2 +import glob +import copy +import math +import pickle +import numpy as np + +from shapely.geometry import Polygon, MultiPoint +""" +Useful function in matching. +""" + + +def remove_empty_bboxes(bboxes): + """ + remove [0., 0., 0., 0.] in structure master bboxes. + len(bboxes.shape) must be 2. 
+ :param bboxes: + :return: + """ + new_bboxes = [] + for bbox in bboxes: + if sum(bbox) == 0.: + continue + new_bboxes.append(bbox) + return np.array(new_bboxes) + + +def xywh2xyxy(bboxes): + if len(bboxes.shape) == 1: + new_bboxes = np.empty_like(bboxes) + new_bboxes[0] = bboxes[0] - bboxes[2] / 2 + new_bboxes[1] = bboxes[1] - bboxes[3] / 2 + new_bboxes[2] = bboxes[0] + bboxes[2] / 2 + new_bboxes[3] = bboxes[1] + bboxes[3] / 2 + return new_bboxes + elif len(bboxes.shape) == 2: + new_bboxes = np.empty_like(bboxes) + new_bboxes[:, 0] = bboxes[:, 0] - bboxes[:, 2] / 2 + new_bboxes[:, 1] = bboxes[:, 1] - bboxes[:, 3] / 2 + new_bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] / 2 + new_bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] / 2 + return new_bboxes + else: + raise ValueError + + +def xyxy2xywh(bboxes): + if len(bboxes.shape) == 1: + new_bboxes = np.empty_like(bboxes) + new_bboxes[0] = bboxes[0] + (bboxes[2] - bboxes[0]) / 2 + new_bboxes[1] = bboxes[1] + (bboxes[3] - bboxes[1]) / 2 + new_bboxes[2] = bboxes[2] - bboxes[0] + new_bboxes[3] = bboxes[3] - bboxes[1] + return new_bboxes + elif len(bboxes.shape) == 2: + new_bboxes = np.empty_like(bboxes) + new_bboxes[:, 0] = bboxes[:, 0] + (bboxes[:, 2] - bboxes[:, 0]) / 2 + new_bboxes[:, 1] = bboxes[:, 1] + (bboxes[:, 3] - bboxes[:, 1]) / 2 + new_bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + new_bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return new_bboxes + else: + raise ValueError + + +def pickle_load(path, prefix='end2end'): + if os.path.isfile(path): + data = pickle.load(open(path, 'rb')) + elif os.path.isdir(path): + data = dict() + search_path = os.path.join(path, '{}_*.pkl'.format(prefix)) + pkls = glob.glob(search_path) + for pkl in pkls: + this_data = pickle.load(open(pkl, 'rb')) + data.update(this_data) + else: + raise ValueError + return data + + +def convert_coord(xyxy): + """ + Convert two points format to four points format. + :param xyxy: + :return: + """ + new_bbox = np.zeros([4, 2], dtype=np.float32) + new_bbox[0, 0], new_bbox[0, 1] = xyxy[0], xyxy[1] + new_bbox[1, 0], new_bbox[1, 1] = xyxy[2], xyxy[1] + new_bbox[2, 0], new_bbox[2, 1] = xyxy[2], xyxy[3] + new_bbox[3, 0], new_bbox[3, 1] = xyxy[0], xyxy[3] + return new_bbox + + +def cal_iou(bbox1, bbox2): + bbox1_poly = Polygon(bbox1).convex_hull + bbox2_poly = Polygon(bbox2).convex_hull + union_poly = np.concatenate((bbox1, bbox2)) + + if not bbox1_poly.intersects(bbox2_poly): + iou = 0 + else: + inter_area = bbox1_poly.intersection(bbox2_poly).area + union_area = MultiPoint(union_poly).convex_hull.area + if union_area == 0: + iou = 0 + else: + iou = float(inter_area) / union_area + return iou + + +def cal_distance(p1, p2): + delta_x = p1[0] - p2[0] + delta_y = p1[1] - p2[1] + d = math.sqrt((delta_x**2) + (delta_y**2)) + return d + + +def is_inside(center_point, corner_point): + """ + Find if center_point inside the bbox(corner_point) or not. + :param center_point: center point (x, y) + :param corner_point: corner point ((x1,y1),(x2,y2)) + :return: + """ + x_flag = False + y_flag = False + if (center_point[0] >= corner_point[0][0]) and ( + center_point[0] <= corner_point[1][0]): + x_flag = True + if (center_point[1] >= corner_point[0][1]) and ( + center_point[1] <= corner_point[1][1]): + y_flag = True + if x_flag and y_flag: + return True + else: + return False + + +def find_no_match(match_list, all_end2end_nums, type='end2end'): + """ + Find out no match end2end bbox in previous match list. + :param match_list: matching pairs. 
+ :param all_end2end_nums: numbers of end2end_xywh + :param type: 'end2end' corresponding to idx 0, 'master' corresponding to idx 1. + :return: no match pse bbox index list + """ + if type == 'end2end': + idx = 0 + elif type == 'master': + idx = 1 + else: + raise ValueError + + no_match_indexs = [] + # m[0] is end2end index m[1] is master index + matched_bbox_indexs = [m[idx] for m in match_list] + for n in range(all_end2end_nums): + if n not in matched_bbox_indexs: + no_match_indexs.append(n) + return no_match_indexs + + +def is_abs_lower_than_threshold(this_bbox, target_bbox, threshold=3): + # only consider y axis, for grouping in row. + delta = abs(this_bbox[1] - target_bbox[1]) + if delta < threshold: + return True + else: + return False + + +def sort_line_bbox(g, bg): + """ + Sorted the bbox in the same line(group) + compare coord 'x' value, where 'y' value is closed in the same group. + :param g: index in the same group + :param bg: bbox in the same group + :return: + """ + + xs = [bg_item[0] for bg_item in bg] + xs_sorted = sorted(xs) + + g_sorted = [None] * len(xs_sorted) + bg_sorted = [None] * len(xs_sorted) + for g_item, bg_item in zip(g, bg): + idx = xs_sorted.index(bg_item[0]) + bg_sorted[idx] = bg_item + g_sorted[idx] = g_item + + return g_sorted, bg_sorted + + +def flatten(sorted_groups, sorted_bbox_groups): + idxs = [] + bboxes = [] + for group, bbox_group in zip(sorted_groups, sorted_bbox_groups): + for g, bg in zip(group, bbox_group): + idxs.append(g) + bboxes.append(bg) + return idxs, bboxes + + +def sort_bbox(end2end_xywh_bboxes, no_match_end2end_indexes): + """ + This function will group the render end2end bboxes in row. + :param end2end_xywh_bboxes: + :param no_match_end2end_indexes: + :return: + """ + groups = [] + bbox_groups = [] + for index, end2end_xywh_bbox in zip(no_match_end2end_indexes, + end2end_xywh_bboxes): + this_bbox = end2end_xywh_bbox + if len(groups) == 0: + groups.append([index]) + bbox_groups.append([this_bbox]) + else: + flag = False + for g, bg in zip(groups, bbox_groups): + # this_bbox is belong to bg's row or not + if is_abs_lower_than_threshold(this_bbox, bg[0]): + g.append(index) + bg.append(this_bbox) + flag = True + break + if not flag: + # this_bbox is not belong to bg's row, create a row. + groups.append([index]) + bbox_groups.append([this_bbox]) + + # sorted bboxes in a group + tmp_groups, tmp_bbox_groups = [], [] + for g, bg in zip(groups, bbox_groups): + g_sorted, bg_sorted = sort_line_bbox(g, bg) + tmp_groups.append(g_sorted) + tmp_bbox_groups.append(bg_sorted) + + # sorted groups, sort by coord y's value. 
+ sorted_groups = [None] * len(tmp_groups) + sorted_bbox_groups = [None] * len(tmp_bbox_groups) + ys = [bg[0][1] for bg in tmp_bbox_groups] + sorted_ys = sorted(ys) + for g, bg in zip(tmp_groups, tmp_bbox_groups): + idx = sorted_ys.index(bg[0][1]) + sorted_groups[idx] = g + sorted_bbox_groups[idx] = bg + + # flatten, get final result + end2end_sorted_idx_list, end2end_sorted_bbox_list \ + = flatten(sorted_groups, sorted_bbox_groups) + + return end2end_sorted_idx_list, end2end_sorted_bbox_list, sorted_groups, sorted_bbox_groups + + +def get_bboxes_list(end2end_result, structure_master_result): + """ + This function is use to convert end2end results and structure master results to + List of xyxy bbox format and List of xywh bbox format + :param end2end_result: bbox's format is xyxy + :param structure_master_result: bbox's format is xywh + :return: 4 kind list of bbox () + """ + # end2end + end2end_xyxy_list = [] + end2end_xywh_list = [] + for end2end_item in end2end_result: + src_bbox = end2end_item['bbox'] + end2end_xyxy_list.append(src_bbox) + xywh_bbox = xyxy2xywh(src_bbox) + end2end_xywh_list.append(xywh_bbox) + end2end_xyxy_bboxes = np.array(end2end_xyxy_list) + end2end_xywh_bboxes = np.array(end2end_xywh_list) + + # structure master + src_bboxes = structure_master_result['bbox'] + src_bboxes = remove_empty_bboxes(src_bboxes) + structure_master_xyxy_bboxes = src_bboxes + xywh_bbox = xyxy2xywh(src_bboxes) + structure_master_xywh_bboxes = xywh_bbox + + return end2end_xyxy_bboxes, end2end_xywh_bboxes, structure_master_xywh_bboxes, structure_master_xyxy_bboxes + + +def center_rule_match(end2end_xywh_bboxes, structure_master_xyxy_bboxes): + """ + Judge end2end Bbox's center point is inside structure master Bbox or not, + if end2end Bbox's center is in structure master Bbox, get matching pair. + :param end2end_xywh_bboxes: + :param structure_master_xyxy_bboxes: + :return: match pairs list, e.g. [[0,1], [1,2], ...] + """ + match_pairs_list = [] + for i, end2end_xywh in enumerate(end2end_xywh_bboxes): + for j, master_xyxy in enumerate(structure_master_xyxy_bboxes): + x_end2end, y_end2end = end2end_xywh[0], end2end_xywh[1] + x_master1, y_master1, x_master2, y_master2 \ + = master_xyxy[0], master_xyxy[1], master_xyxy[2], master_xyxy[3] + center_point_end2end = (x_end2end, y_end2end) + corner_point_master = ((x_master1, y_master1), + (x_master2, y_master2)) + if is_inside(center_point_end2end, corner_point_master): + match_pairs_list.append([i, j]) + return match_pairs_list + + +def iou_rule_match(end2end_xyxy_bboxes, end2end_xyxy_indexes, + structure_master_xyxy_bboxes): + """ + Use iou to find matching list. + choose max iou value bbox as match pair. + :param end2end_xyxy_bboxes: + :param end2end_xyxy_indexes: original end2end indexes. + :param structure_master_xyxy_bboxes: + :return: match pairs list, e.g. [[0,1], [1,2], ...] 
+ """ + match_pair_list = [] + for end2end_xyxy_index, end2end_xyxy in zip(end2end_xyxy_indexes, + end2end_xyxy_bboxes): + max_iou = 0 + max_match = [None, None] + for j, master_xyxy in enumerate(structure_master_xyxy_bboxes): + end2end_4xy = convert_coord(end2end_xyxy) + master_4xy = convert_coord(master_xyxy) + iou = cal_iou(end2end_4xy, master_4xy) + if iou > max_iou: + max_match[0], max_match[1] = end2end_xyxy_index, j + max_iou = iou + + if max_match[0] is None: + # no match + continue + match_pair_list.append(max_match) + return match_pair_list + + +def distance_rule_match(end2end_indexes, end2end_bboxes, master_indexes, + master_bboxes): + """ + Get matching between no-match end2end bboxes and no-match master bboxes. + Use min distance to match. + This rule will only run (no-match end2end nums > 0) and (no-match master nums > 0) + It will Return master_bboxes_nums match-pairs. + :param end2end_indexes: + :param end2end_bboxes: + :param master_indexes: + :param master_bboxes: + :return: match_pairs list, e.g. [[0,1], [1,2], ...] + """ + min_match_list = [] + for j, master_bbox in zip(master_indexes, master_bboxes): + min_distance = np.inf + min_match = [0, 0] # i, j + for i, end2end_bbox in zip(end2end_indexes, end2end_bboxes): + x_end2end, y_end2end = end2end_bbox[0], end2end_bbox[1] + x_master, y_master = master_bbox[0], master_bbox[1] + end2end_point = (x_end2end, y_end2end) + master_point = (x_master, y_master) + dist = cal_distance(master_point, end2end_point) + if dist < min_distance: + min_match[0], min_match[1] = i, j + min_distance = dist + min_match_list.append(min_match) + return min_match_list + + +def extra_match(no_match_end2end_indexes, master_bbox_nums): + """ + This function will create some virtual master bboxes, + and get match with the no match end2end indexes. + :param no_match_end2end_indexes: + :param master_bbox_nums: + :return: + """ + end_nums = len(no_match_end2end_indexes) + master_bbox_nums + extra_match_list = [] + for i in range(master_bbox_nums, end_nums): + end2end_index = no_match_end2end_indexes[i - master_bbox_nums] + extra_match_list.append([end2end_index, i]) + return extra_match_list + + +def get_match_dict(match_list): + """ + Convert match_list to a dict, where key is master bbox's index, value is end2end bbox index. + :param match_list: + :return: + """ + match_dict = dict() + for match_pair in match_list: + end2end_index, master_index = match_pair[0], match_pair[1] + if master_index not in match_dict.keys(): + match_dict[master_index] = [end2end_index] + else: + match_dict[master_index].append(end2end_index) + return match_dict + + +def deal_successive_space(text): + """ + deal successive space character for text + 1. Replace ' '*3 with '' which is real space is text + 2. Remove ' ', which is split token, not true space + 3. Replace '' with ' ', to get real text + :param text: + :return: + """ + text = text.replace(' ' * 3, '') + text = text.replace(' ', '') + text = text.replace('', ' ') + return text + + +def reduce_repeat_bb(text_list, break_token): + """ + convert ['Local', 'government', 'unit'] to ['Local government unit'] + PS: maybe style Local is also exist, too. it can be processed like this. 
+ :param text_list: + :param break_token: + :return: + """ + count = 0 + for text in text_list: + if text.startswith(''): + count += 1 + if count == len(text_list): + new_text_list = [] + for text in text_list: + text = text.replace('', '').replace('', '') + new_text_list.append(text) + return ['' + break_token.join(new_text_list) + ''] + else: + return text_list + + +def get_match_text_dict(match_dict, end2end_info, break_token=' '): + match_text_dict = dict() + for master_index, end2end_index_list in match_dict.items(): + text_list = [ + end2end_info[end2end_index]['text'] + for end2end_index in end2end_index_list + ] + text_list = reduce_repeat_bb(text_list, break_token) + text = break_token.join(text_list) + match_text_dict[master_index] = text + return match_text_dict + + +def merge_span_token(master_token_list): + """ + Merge the span style token (row span or col span). + :param master_token_list: + :return: + """ + new_master_token_list = [] + pointer = 0 + if master_token_list[-1] != '': + master_token_list.append('') + while master_token_list[pointer] != '': + try: + if master_token_list[pointer] == '' + '' + """ + tmp = ''.join(master_token_list[pointer:pointer + 3 + 1]) + pointer += 4 + new_master_token_list.append(tmp) + + elif master_token_list[pointer + 2].startswith( + ' colspan=') or master_token_list[ + pointer + 2].startswith(' rowspan='): + """ + example: + pattern + '' + '' + """ + tmp = ''.join(master_token_list[pointer:pointer + 4 + 1]) + pointer += 5 + new_master_token_list.append(tmp) + + else: + new_master_token_list.append(master_token_list[pointer]) + pointer += 1 + else: + new_master_token_list.append(master_token_list[pointer]) + pointer += 1 + except: + print("Break in merge...") + break + new_master_token_list.append('') + + return new_master_token_list + + +def deal_eb_token(master_token): + """ + post process with , , ... + emptyBboxTokenDict = { + "[]": '', + "[' ']": '', + "['', ' ', '']": '', + "['\\u2028', '\\u2028']": '', + "['', ' ', '']": '', + "['', '']": '', + "['', ' ', '']": '', + "['', '', '', '']": '', + "['', '', ' ', '', '']": '', + "['', '']": '', + "['', ' ', '\\u2028', ' ', '\\u2028', ' ', '']": '', + } + :param master_token: + :return: + """ + master_token = master_token.replace('', '') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', '\u2028\u2028') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', '') + master_token = master_token.replace('', ' ') + master_token = master_token.replace('', + '') + master_token = master_token.replace('', + ' ') + master_token = master_token.replace('', '') + master_token = master_token.replace('', + ' \u2028 \u2028 ') + return master_token + + +def insert_text_to_token(master_token_list, match_text_dict): + """ + Insert OCR text result to structure token. 
+ :param master_token_list: + :param match_text_dict: + :return: + """ + master_token_list = merge_span_token(master_token_list) + merged_result_list = [] + text_count = 0 + for master_token in master_token_list: + if master_token.startswith(' len(match_text_dict) - 1: + text_count += 1 + continue + elif text_count not in match_text_dict.keys(): + text_count += 1 + continue + else: + master_token = master_token.replace( + '><', '>{}<'.format(match_text_dict[text_count])) + text_count += 1 + master_token = deal_eb_token(master_token) + merged_result_list.append(master_token) + + return ''.join(merged_result_list) + + +def deal_isolate_span(thead_part): + """ + Deal with isolate span cases in this function. + It causes by wrong prediction in structure recognition model. + eg. predict to rowspan="2">. + :param thead_part: + :return: + """ + # 1. find out isolate span tokens. + isolate_pattern = " rowspan=\"(\d)+\" colspan=\"(\d)+\">
|" \ + " colspan=\"(\d)+\" rowspan=\"(\d)+\">
|" \ + " rowspan=\"(\d)+\">|" \ + " colspan=\"(\d)+\">" + isolate_iter = re.finditer(isolate_pattern, thead_part) + isolate_list = [i.group() for i in isolate_iter] + + # 2. find out span number, by step 1 results. + span_pattern = " rowspan=\"(\d)+\" colspan=\"(\d)+\"|" \ + " colspan=\"(\d)+\" rowspan=\"(\d)+\"|" \ + " rowspan=\"(\d)+\"|" \ + " colspan=\"(\d)+\"" + corrected_list = [] + for isolate_item in isolate_list: + span_part = re.search(span_pattern, isolate_item) + spanStr_in_isolateItem = span_part.group() + # 3. merge the span number into the span token format string. + if spanStr_in_isolateItem is not None: + corrected_item = ''.format(spanStr_in_isolateItem) + corrected_list.append(corrected_item) + else: + corrected_list.append(None) + + # 4. replace original isolated token. + for corrected_item, isolate_item in zip(corrected_list, isolate_list): + if corrected_item is not None: + thead_part = thead_part.replace(isolate_item, corrected_item) + else: + pass + return thead_part + + +def deal_duplicate_bb(thead_part): + """ + Deal duplicate or after replace. + Keep one in a token. + :param thead_part: + :return: + """ + # 1. find out in . + td_pattern = "(.+?)|" \ + "(.+?)|" \ + "(.+?)|" \ + "(.+?)|" \ + "(.*?)" + td_iter = re.finditer(td_pattern, thead_part) + td_list = [t.group() for t in td_iter] + + # 2. is multiply in or not? + new_td_list = [] + for td_item in td_list: + if td_item.count('') > 1 or td_item.count('') > 1: + # multiply in case. + # 1. remove all + td_item = td_item.replace('', '').replace('', '') + # 2. replace -> , -> . + td_item = td_item.replace('', '').replace('', + '') + new_td_list.append(td_item) + else: + new_td_list.append(td_item) + + # 3. replace original thead part. + for td_item, new_td_item in zip(td_list, new_td_list): + thead_part = thead_part.replace(td_item, new_td_item) + return thead_part + + +def deal_bb(result_token): + """ + In our opinion, always occurs in text's context. + This function will find out all tokens in and insert by manual. + :param result_token: + :return: + """ + # find out parts. + thead_pattern = '(.*?)' + if re.search(thead_pattern, result_token) is None: + return result_token + thead_part = re.search(thead_pattern, result_token).group() + origin_thead_part = copy.deepcopy(thead_part) + + # check "rowspan" or "colspan" occur in parts or not . + span_pattern = "|||" + span_iter = re.finditer(span_pattern, thead_part) + span_list = [s.group() for s in span_iter] + has_span_in_head = True if len(span_list) > 0 else False + + if not has_span_in_head: + # not include "rowspan" or "colspan" branch 1. + # 1. replace to , and to + # 2. it is possible to predict text include or by Text-line recognition, + # so we replace to , and to + thead_part = thead_part.replace('', '')\ + .replace('', '')\ + .replace('', '')\ + .replace('', '') + else: + # include "rowspan" or "colspan" branch 2. + # Firstly, we deal rowspan or colspan cases. + # 1. replace > to > + # 2. replace to + # 3. 
it is possible to predict text include or by Text-line recognition, + # so we replace to , and to + + # Secondly, deal ordinary cases like branch 1 + + # replace ">" to "" + replaced_span_list = [] + for sp in span_list: + replaced_span_list.append(sp.replace('>', '>')) + for sp, rsp in zip(span_list, replaced_span_list): + thead_part = thead_part.replace(sp, rsp) + + # replace "" to "" + thead_part = thead_part.replace('', '') + + # remove duplicated by re.sub + mb_pattern = "()+" + single_b_string = "" + thead_part = re.sub(mb_pattern, single_b_string, thead_part) + + mgb_pattern = "()+" + single_gb_string = "" + thead_part = re.sub(mgb_pattern, single_gb_string, thead_part) + + # ordinary cases like branch 1 + thead_part = thead_part.replace('', '').replace('', + '') + + # convert back to , empty cell has no . + # but space cell( ) is suitable for + thead_part = thead_part.replace('', '') + # deal with duplicated + thead_part = deal_duplicate_bb(thead_part) + # deal with isolate span tokens, which causes by wrong predict by structure prediction. + # eg.PMC5994107_011_00.png + thead_part = deal_isolate_span(thead_part) + # replace original result with new thead part. + result_token = result_token.replace(origin_thead_part, thead_part) + return result_token + + +class Matcher: + def __init__(self, end2end_file, structure_master_file): + """ + This class process the end2end results and structure recognition results. + :param end2end_file: end2end results predict by end2end inference. + :param structure_master_file: structure recognition results predict by structure master inference. + """ + self.end2end_file = end2end_file + self.structure_master_file = structure_master_file + self.end2end_results = pickle_load(end2end_file, prefix='end2end') + self.structure_master_results = pickle_load( + structure_master_file, prefix='structure') + + def match(self): + """ + Match process: + pre-process : convert end2end and structure master results to xyxy, xywh ndnarray format. + 1. Use pseBbox is inside masterBbox judge rule + 2. Use iou between pseBbox and masterBbox rule + 3. Use min distance of center point rule + :return: + """ + match_results = dict() + for idx, (file_name, + end2end_result) in enumerate(self.end2end_results.items()): + match_list = [] + if file_name not in self.structure_master_results: + continue + structure_master_result = self.structure_master_results[file_name] + end2end_xyxy_bboxes, end2end_xywh_bboxes, structure_master_xywh_bboxes, structure_master_xyxy_bboxes = \ + get_bboxes_list(end2end_result, structure_master_result) + + # rule 1: center rule + center_rule_match_list = \ + center_rule_match(end2end_xywh_bboxes, structure_master_xyxy_bboxes) + match_list.extend(center_rule_match_list) + + # rule 2: iou rule + # firstly, find not match index in previous step. + center_no_match_end2end_indexs = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + if len(center_no_match_end2end_indexs) > 0: + center_no_match_end2end_xyxy = end2end_xyxy_bboxes[ + center_no_match_end2end_indexs] + # secondly, iou rule match + iou_rule_match_list = \ + iou_rule_match(center_no_match_end2end_xyxy, center_no_match_end2end_indexs, structure_master_xyxy_bboxes) + match_list.extend(iou_rule_match_list) + + # rule 3: distance rule + # match between no-match end2end bboxes and no-match master bboxes. + # it will return master_bboxes_nums match-pairs. + # firstly, find not match index in previous step. 
+ centerIou_no_match_end2end_indexs = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + centerIou_no_match_master_indexs = \ + find_no_match(match_list, len(structure_master_xywh_bboxes), type='master') + if len(centerIou_no_match_master_indexs) > 0 and len( + centerIou_no_match_end2end_indexs) > 0: + centerIou_no_match_end2end_xywh = end2end_xywh_bboxes[ + centerIou_no_match_end2end_indexs] + centerIou_no_match_master_xywh = structure_master_xywh_bboxes[ + centerIou_no_match_master_indexs] + distance_match_list = distance_rule_match( + centerIou_no_match_end2end_indexs, + centerIou_no_match_end2end_xywh, + centerIou_no_match_master_indexs, + centerIou_no_match_master_xywh) + match_list.extend(distance_match_list) + + # TODO: + # The render no-match pseBbox, insert the last + # After step3 distance rule, a master bbox at least match one end2end bbox. + # But end2end bbox maybe overmuch, because numbers of master bbox will cut by max length. + # For these render end2end bboxes, we will make some virtual master bboxes, and get matching. + # The above extra insert bboxes will be further processed in "formatOutput" function. + # After this operation, it will increase TEDS score. + no_match_end2end_indexes = \ + find_no_match(match_list, len(end2end_xywh_bboxes), type='end2end') + if len(no_match_end2end_indexes) > 0: + no_match_end2end_xywh = end2end_xywh_bboxes[ + no_match_end2end_indexes] + # sort the render no-match end2end bbox in row + end2end_sorted_indexes_list, end2end_sorted_bboxes_list, sorted_groups, sorted_bboxes_groups = \ + sort_bbox(no_match_end2end_xywh, no_match_end2end_indexes) + # make virtual master bboxes, and get matching with the no-match end2end bboxes. + extra_match_list = extra_match( + end2end_sorted_indexes_list, + len(structure_master_xywh_bboxes)) + match_list_add_extra_match = copy.deepcopy(match_list) + match_list_add_extra_match.extend(extra_match_list) + else: + # no no-match end2end bboxes + match_list_add_extra_match = copy.deepcopy(match_list) + sorted_groups = [] + sorted_bboxes_groups = [] + + match_result_dict = { + 'match_list': match_list, + 'match_list_add_extra_match': match_list_add_extra_match, + 'sorted_groups': sorted_groups, + 'sorted_bboxes_groups': sorted_bboxes_groups + } + + # format output + match_result_dict = self._format(match_result_dict, file_name) + + match_results[file_name] = match_result_dict + + return match_results + + def _format(self, match_result, file_name): + """ + Extend the master token(insert virtual master token), and format matching result. + :param match_result: + :param file_name: + :return: + """ + end2end_info = self.end2end_results[file_name] + master_info = self.structure_master_results[file_name] + master_token = master_info['text'] + sorted_groups = match_result['sorted_groups'] + + # creat virtual master token + virtual_master_token_list = [] + for line_group in sorted_groups: + tmp_list = [''] + item_nums = len(line_group) + for _ in range(item_nums): + tmp_list.append('') + tmp_list.append('') + virtual_master_token_list.extend(tmp_list) + + # insert virtual master token + master_token_list = master_token.split(',') + if master_token_list[-1] == '': + # complete predict(no cut by max length) + # This situation insert virtual master token will drop TEDs score in val set. + # So we will not extend virtual token in this situation. 
+ + # fake extend virtual + master_token_list[:-1].extend(virtual_master_token_list) + + # real extend virtual + # master_token_list = master_token_list[:-1] + # master_token_list.extend(virtual_master_token_list) + # master_token_list.append('') + + elif master_token_list[-1] == '': + master_token_list.append('') + master_token_list.extend(virtual_master_token_list) + master_token_list.append('') + else: + master_token_list.extend(virtual_master_token_list) + master_token_list.append('') + + # format output + match_result.setdefault('matched_master_token_list', master_token_list) + return match_result + + def get_merge_result(self, match_results): + """ + Merge the OCR result into structure token to get final results. + :param match_results: + :return: + """ + merged_results = dict() + + # break_token is linefeed token, when one master bbox has multiply end2end bboxes. + break_token = ' ' + + for idx, (file_name, match_info) in enumerate(match_results.items()): + end2end_info = self.end2end_results[file_name] + master_token_list = match_info['matched_master_token_list'] + match_list = match_info['match_list_add_extra_match'] + + match_dict = get_match_dict(match_list) + match_text_dict = get_match_text_dict(match_dict, end2end_info, + break_token) + merged_result = insert_text_to_token(master_token_list, + match_text_dict) + merged_result = deal_bb(merged_result) + + merged_results[file_name] = merged_result + + return merged_results + + +class TableMasterMatcher(Matcher): + def __init__(self): + pass + + def __call__(self, structure_res, dt_boxes, rec_res, img_name=1): + end2end_results = {img_name: []} + for dt_box, res in zip(dt_boxes, rec_res): + d = dict( + bbox=np.array(dt_box), + text=res[0], ) + end2end_results[img_name].append(d) + + self.end2end_results = end2end_results + + structure_master_result_dict = {img_name: {}} + pred_structures, pred_bboxes = structure_res + pred_structures = ','.join(pred_structures[3:-3]) + structure_master_result_dict[img_name]['text'] = pred_structures + structure_master_result_dict[img_name]['bbox'] = pred_bboxes + self.structure_master_results = structure_master_result_dict + + # match + match_results = self.match() + merged_results = self.get_merge_result(match_results) + pred_html = merged_results[img_name] + pred_html = '' + pred_html + '
' + return pred_html diff --git a/ppstructure/table/table_metric/__init__.py b/ppstructure/table/table_metric/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de2d307430f68881ece1e41357d3b2f423e07ddd --- /dev/null +++ b/ppstructure/table/table_metric/__init__.py @@ -0,0 +1,16 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['TEDS'] +from .table_metric import TEDS \ No newline at end of file diff --git a/ppstructure/table/table_metric/parallel.py b/ppstructure/table/table_metric/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..f7326a1f506ca5fb7b3e97b0d077dc016e7eb7c7 --- /dev/null +++ b/ppstructure/table/table_metric/parallel.py @@ -0,0 +1,51 @@ +from tqdm import tqdm +from concurrent.futures import ProcessPoolExecutor, as_completed + + +def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0): + """ + A parallel version of the map function with a progress bar. + Args: + array (array-like): An array to iterate over. + function (function): A python function to apply to the elements of array + n_jobs (int, default=16): The number of cores to use + use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of + keyword arguments to function + front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job. + Useful for catching bugs + Returns: + [function(array[0]), function(array[1]), ...] + """ + # We run the first few iterations serially to catch bugs + if front_num > 0: + front = [function(**a) if use_kwargs else function(a) + for a in array[:front_num]] + else: + front = [] + # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging. + if n_jobs == 1: + return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])] + # Assemble the workers + with ProcessPoolExecutor(max_workers=n_jobs) as pool: + # Pass the elements of array into function + if use_kwargs: + futures = [pool.submit(function, **a) for a in array[front_num:]] + else: + futures = [pool.submit(function, a) for a in array[front_num:]] + kwargs = { + 'total': len(futures), + 'unit': 'it', + 'unit_scale': True, + 'leave': True + } + # Print out the progress as tasks complete + for f in tqdm(as_completed(futures), **kwargs): + pass + out = [] + # Get the results from the futures. 
+ for i, future in tqdm(enumerate(futures)): + try: + out.append(future.result()) + except Exception as e: + out.append(e) + return front + out diff --git a/ppstructure/table/table_metric/table_metric.py b/ppstructure/table/table_metric/table_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..923a9c0071d083de72a2a896d6f62037373d4e73 --- /dev/null +++ b/ppstructure/table/table_metric/table_metric.py @@ -0,0 +1,214 @@ +# Copyright 2020 IBM +# Author: peter.zhong@au1.ibm.com +# +# This is free software; you can redistribute it and/or modify +# it under the terms of the Apache 2.0 License. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Apache 2.0 License for more details. + +from rapidfuzz.distance import Levenshtein +from apted import APTED, Config +from apted.helpers import Tree +from lxml import etree, html +from collections import deque +from .parallel import parallel_process +from tqdm import tqdm + + +class TableTree(Tree): + def __init__(self, tag, colspan=None, rowspan=None, content=None, *children): + self.tag = tag + self.colspan = colspan + self.rowspan = rowspan + self.content = content + self.children = list(children) + + def bracket(self): + """Show tree using brackets notation""" + if self.tag == 'td': + result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \ + (self.tag, self.colspan, self.rowspan, self.content) + else: + result = '"tag": %s' % self.tag + for child in self.children: + result += child.bracket() + return "{{{}}}".format(result) + + +class CustomConfig(Config): + def rename(self, node1, node2): + """Compares attributes of trees""" + #print(node1.tag) + if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): + return 1. + if node1.tag == 'td': + if node1.content or node2.content: + #print(node1.content, ) + return Levenshtein.normalized_distance(node1.content, node2.content) + return 0. + + + +class CustomConfig_del_short(Config): + def rename(self, node1, node2): + """Compares attributes of trees""" + if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): + return 1. + if node1.tag == 'td': + if node1.content or node2.content: + #print('before') + #print(node1.content, node2.content) + #print('after') + node1_content = node1.content + node2_content = node2.content + if len(node1_content) < 3: + node1_content = ['####'] + if len(node2_content) < 3: + node2_content = ['####'] + return Levenshtein.normalized_distance(node1_content, node2_content) + return 0. + +class CustomConfig_del_block(Config): + def rename(self, node1, node2): + """Compares attributes of trees""" + if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan): + return 1. + if node1.tag == 'td': + if node1.content or node2.content: + + node1_content = node1.content + node2_content = node2.content + while ' ' in node1_content: + print(node1_content.index(' ')) + node1_content.pop(node1_content.index(' ')) + while ' ' in node2_content: + print(node2_content.index(' ')) + node2_content.pop(node2_content.index(' ')) + return Levenshtein.normalized_distance(node1_content, node2_content) + return 0. 
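A quick illustration (not part of the original file) of the cost model the Config classes above implement for APTED: cells with identical tag, colspan and rowspan are charged the normalized Levenshtein distance of their text, while any structural mismatch costs a full edit of 1.0. It relies only on the apted and rapidfuzz packages already imported above.

from apted import APTED

pred_cell = TableTree('td', 1, 1, list('500'))    # hypothetical predicted cell text
true_cell = TableTree('td', 1, 1, list('5000'))   # hypothetical ground-truth cell text

# Same tag and spans, so the cost is the normalized text distance (about 0.25 here);
# a differing tag, colspan or rowspan would instead cost 1.0.
print(APTED(pred_cell, true_cell, CustomConfig()).compute_edit_distance())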
+ +class TEDS(object): + ''' Tree Edit Distance basead Similarity + ''' + + def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None): + assert isinstance(n_jobs, int) and ( + n_jobs >= 1), 'n_jobs must be an integer greather than 1' + self.structure_only = structure_only + self.n_jobs = n_jobs + self.ignore_nodes = ignore_nodes + self.__tokens__ = [] + + def tokenize(self, node): + ''' Tokenizes table cells + ''' + self.__tokens__.append('<%s>' % node.tag) + if node.text is not None: + self.__tokens__ += list(node.text) + for n in node.getchildren(): + self.tokenize(n) + if node.tag != 'unk': + self.__tokens__.append('' % node.tag) + if node.tag != 'td' and node.tail is not None: + self.__tokens__ += list(node.tail) + + def load_html_tree(self, node, parent=None): + ''' Converts HTML tree to the format required by apted + ''' + global __tokens__ + if node.tag == 'td': + if self.structure_only: + cell = [] + else: + self.__tokens__ = [] + self.tokenize(node) + cell = self.__tokens__[1:-1].copy() + new_node = TableTree(node.tag, + int(node.attrib.get('colspan', '1')), + int(node.attrib.get('rowspan', '1')), + cell, *deque()) + else: + new_node = TableTree(node.tag, None, None, None, *deque()) + if parent is not None: + parent.children.append(new_node) + if node.tag != 'td': + for n in node.getchildren(): + self.load_html_tree(n, new_node) + if parent is None: + return new_node + + def evaluate(self, pred, true): + ''' Computes TEDS score between the prediction and the ground truth of a + given sample + ''' + if (not pred) or (not true): + return 0.0 + parser = html.HTMLParser(remove_comments=True, encoding='utf-8') + pred = html.fromstring(pred, parser=parser) + true = html.fromstring(true, parser=parser) + if pred.xpath('body/table') and true.xpath('body/table'): + pred = pred.xpath('body/table')[0] + true = true.xpath('body/table')[0] + if self.ignore_nodes: + etree.strip_tags(pred, *self.ignore_nodes) + etree.strip_tags(true, *self.ignore_nodes) + n_nodes_pred = len(pred.xpath(".//*")) + n_nodes_true = len(true.xpath(".//*")) + n_nodes = max(n_nodes_pred, n_nodes_true) + tree_pred = self.load_html_tree(pred) + tree_true = self.load_html_tree(true) + distance = APTED(tree_pred, tree_true, + CustomConfig()).compute_edit_distance() + return 1.0 - (float(distance) / n_nodes) + else: + return 0.0 + + def batch_evaluate(self, pred_json, true_json): + ''' Computes TEDS score between the prediction and the ground truth of + a batch of samples + @params pred_json: {'FILENAME': 'HTML CODE', ...} + @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...} + @output: {'FILENAME': 'TEDS SCORE', ...} + ''' + samples = true_json.keys() + if self.n_jobs == 1: + scores = [self.evaluate(pred_json.get( + filename, ''), true_json[filename]['html']) for filename in tqdm(samples)] + else: + inputs = [{'pred': pred_json.get( + filename, ''), 'true': true_json[filename]['html']} for filename in samples] + scores = parallel_process( + inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1) + scores = dict(zip(samples, scores)) + return scores + + def batch_evaluate_html(self, pred_htmls, true_htmls): + ''' Computes TEDS score between the prediction and the ground truth of + a batch of samples + ''' + if self.n_jobs == 1: + scores = [self.evaluate(pred_html, true_html) for ( + pred_html, true_html) in zip(pred_htmls, true_htmls)] + else: + inputs = [{"pred": pred_html, "true": true_html} for( + pred_html, true_html) in zip(pred_htmls, true_htmls)] + + scores = parallel_process( + 
inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1) + return scores + + +if __name__ == '__main__': + import json + import pprint + with open('sample_pred.json') as fp: + pred_json = json.load(fp) + with open('sample_gt.json') as fp: + true_json = json.load(fp) + teds = TEDS(n_jobs=4) + scores = teds.batch_evaluate(pred_json, true_json) + pp = pprint.PrettyPrinter() + pp.pprint(scores) diff --git a/ppstructure/table/tablepyxl/__init__.py b/ppstructure/table/tablepyxl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc0085071cf4497b01fc648e7c38f2e8d9d173d0 --- /dev/null +++ b/ppstructure/table/tablepyxl/__init__.py @@ -0,0 +1,13 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/ppstructure/table/tablepyxl/__pycache__/__init__.cpython-37.pyc b/ppstructure/table/tablepyxl/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07dfd2fc520825013ab12090e9f012985af773a4 Binary files /dev/null and b/ppstructure/table/tablepyxl/__pycache__/__init__.cpython-37.pyc differ diff --git a/ppstructure/table/tablepyxl/__pycache__/style.cpython-37.pyc b/ppstructure/table/tablepyxl/__pycache__/style.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e8b0bd05e3a4596b22a0874992d1ddfe5eab834 Binary files /dev/null and b/ppstructure/table/tablepyxl/__pycache__/style.cpython-37.pyc differ diff --git a/ppstructure/table/tablepyxl/__pycache__/tablepyxl.cpython-37.pyc b/ppstructure/table/tablepyxl/__pycache__/tablepyxl.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee951359d6558f9189910167ab4affe19ba5547b Binary files /dev/null and b/ppstructure/table/tablepyxl/__pycache__/tablepyxl.cpython-37.pyc differ diff --git a/ppstructure/table/tablepyxl/style.py b/ppstructure/table/tablepyxl/style.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd794b1b47d7f9e4f9294dde7330f592d613656 --- /dev/null +++ b/ppstructure/table/tablepyxl/style.py @@ -0,0 +1,283 @@ +# This is where we handle translating css styles into openpyxl styles +# and cascading those from parent to child in the dom. 
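For orientation, a tiny illustration of the css-string-to-dict step this module is built around; the inline style below is made up, and style_string_to_dict and StyleDict.get_color are defined further down in this file.

# Hypothetical inline style as it might appear on a table cell.
style = "font-weight: bold; text-align: center; background-color: #0f0"
print(style_string_to_dict(style))
# {'font-weight': 'bold', 'text-align': 'center', 'background-color': '#0f0'}
# StyleDict.get_color later strips the '#' and expands '0f0' to '00ff00' for openpyxl.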
+ +from openpyxl.cell import cell +from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color +from openpyxl.styles.fills import FILL_SOLID +from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE +from openpyxl.styles.colors import BLACK + +FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy' + + +def colormap(color): + """ + Convenience for looking up known colors + """ + cmap = {'black': BLACK} + return cmap.get(color, color) + + +def style_string_to_dict(style): + """ + Convert css style string to a python dictionary + """ + def clean_split(string, delim): + return (s.strip() for s in string.split(delim)) + styles = [clean_split(s, ":") for s in style.split(";") if ":" in s] + return dict(styles) + + +def get_side(style, name): + return {'border_style': style.get('border-{}-style'.format(name)), + 'color': colormap(style.get('border-{}-color'.format(name)))} + +known_styles = {} + + +def style_dict_to_named_style(style_dict, number_format=None): + """ + Change css style (stored in a python dictionary) to openpyxl NamedStyle + """ + + style_and_format_string = str({ + 'style_dict': style_dict, + 'parent': style_dict.parent, + 'number_format': number_format, + }) + + if style_and_format_string not in known_styles: + # Font + font = Font(bold=style_dict.get('font-weight') == 'bold', + color=style_dict.get_color('color', None), + size=style_dict.get('font-size')) + + # Alignment + alignment = Alignment(horizontal=style_dict.get('text-align', 'general'), + vertical=style_dict.get('vertical-align'), + wrap_text=style_dict.get('white-space', 'nowrap') == 'normal') + + # Fill + bg_color = style_dict.get_color('background-color') + fg_color = style_dict.get_color('foreground-color', Color()) + fill_type = style_dict.get('fill-type') + if bg_color and bg_color != 'transparent': + fill = PatternFill(fill_type=fill_type or FILL_SOLID, + start_color=bg_color, + end_color=fg_color) + else: + fill = PatternFill() + + # Border + border = Border(left=Side(**get_side(style_dict, 'left')), + right=Side(**get_side(style_dict, 'right')), + top=Side(**get_side(style_dict, 'top')), + bottom=Side(**get_side(style_dict, 'bottom')), + diagonal=Side(**get_side(style_dict, 'diagonal')), + diagonal_direction=None, + outline=Side(**get_side(style_dict, 'outline')), + vertical=None, + horizontal=None) + + name = 'Style {}'.format(len(known_styles) + 1) + + pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border, + number_format=number_format) + + known_styles[style_and_format_string] = pyxl_style + + return known_styles[style_and_format_string] + + +class StyleDict(dict): + """ + It's like a dictionary, but it looks for items in the parent dictionary + """ + def __init__(self, *args, **kwargs): + self.parent = kwargs.pop('parent', None) + super(StyleDict, self).__init__(*args, **kwargs) + + def __getitem__(self, item): + if item in self: + return super(StyleDict, self).__getitem__(item) + elif self.parent: + return self.parent[item] + else: + raise KeyError('{} not found'.format(item)) + + def __hash__(self): + return hash(tuple([(k, self.get(k)) for k in self._keys()])) + + # Yielding the keys avoids creating unnecessary data structures + # and happily works with both python2 and python3 where the + # .keys() method is a dictionary_view in python3 and a list in python2. 
+ def _keys(self): + yielded = set() + for k in self.keys(): + yielded.add(k) + yield k + if self.parent: + for k in self.parent._keys(): + if k not in yielded: + yielded.add(k) + yield k + + def get(self, k, d=None): + try: + return self[k] + except KeyError: + return d + + def get_color(self, k, d=None): + """ + Strip leading # off colors if necessary + """ + color = self.get(k, d) + if hasattr(color, 'startswith') and color.startswith('#'): + color = color[1:] + if len(color) == 3: # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that + color = ''.join(2 * c for c in color) + return color + + +class Element(object): + """ + Our base class for representing an html element along with a cascading style. + The element is created along with a parent so that the StyleDict that we store + can point to the parent's StyleDict. + """ + def __init__(self, element, parent=None): + self.element = element + self.number_format = None + parent_style = parent.style_dict if parent else None + self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style) + self._style_cache = None + + def style(self): + """ + Turn the css styles for this element into an openpyxl NamedStyle. + """ + if not self._style_cache: + self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format) + return self._style_cache + + def get_dimension(self, dimension_key): + """ + Extracts the dimension from the style dict of the Element and returns it as a float. + """ + dimension = self.style_dict.get(dimension_key) + if dimension: + if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']: + dimension = dimension[:-2] + dimension = float(dimension) + return dimension + + +class Table(Element): + """ + The concrete implementations of Elements are semantically named for the types of elements we are interested in. + This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to + allowing Element to have an arbitrary number of children and dealing with an abstract element tree. + """ + def __init__(self, table): + """ + takes an html table object (from lxml) + """ + super(Table, self).__init__(table) + table_head = table.find('thead') + self.head = TableHead(table_head, parent=self) if table_head is not None else None + table_body = table.find('tbody') + self.body = TableBody(table_body if table_body is not None else table, parent=self) + + +class TableHead(Element): + """ + This class maps to the `` element of the html table. + """ + def __init__(self, head, parent=None): + super(TableHead, self).__init__(head, parent=parent) + self.rows = [TableRow(tr, parent=self) for tr in head.findall('tr')] + + +class TableBody(Element): + """ + This class maps to the `` element of the html table. + """ + def __init__(self, body, parent=None): + super(TableBody, self).__init__(body, parent=parent) + self.rows = [TableRow(tr, parent=self) for tr in body.findall('tr')] + + +class TableRow(Element): + """ + This class maps to the `` element of the html table. 
+ """ + def __init__(self, tr, parent=None): + super(TableRow, self).__init__(tr, parent=parent) + self.cells = [TableCell(cell, parent=self) for cell in tr.findall('th') + tr.findall('td')] + + +def element_to_string(el): + return _element_to_string(el).strip() + + +def _element_to_string(el): + string = '' + + for x in el.iterchildren(): + string += '\n' + _element_to_string(x) + + text = el.text.strip() if el.text else '' + tail = el.tail.strip() if el.tail else '' + + return text + string + '\n' + tail + + +class TableCell(Element): + """ + This class maps to the `` element of the html table. + """ + CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE', + 'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'} + + def __init__(self, cell, parent=None): + super(TableCell, self).__init__(cell, parent=parent) + self.value = element_to_string(cell) + self.number_format = self.get_number_format() + + def data_type(self): + cell_types = self.CELL_TYPES & set(self.element.get('class', '').split()) + if cell_types: + if 'TYPE_FORMULA' in cell_types: + # Make sure TYPE_FORMULA takes precedence over the other classes in the set. + cell_type = 'TYPE_FORMULA' + elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}: + cell_type = 'TYPE_NUMERIC' + else: + cell_type = cell_types.pop() + else: + cell_type = 'TYPE_STRING' + return getattr(cell, cell_type) + + def get_number_format(self): + if 'TYPE_CURRENCY' in self.element.get('class', '').split(): + return FORMAT_CURRENCY_USD_SIMPLE + if 'TYPE_INTEGER' in self.element.get('class', '').split(): + return '#,##0' + if 'TYPE_PERCENTAGE' in self.element.get('class', '').split(): + return FORMAT_PERCENTAGE + if 'TYPE_DATE' in self.element.get('class', '').split(): + return FORMAT_DATE_MMDDYYYY + if self.data_type() == cell.TYPE_NUMERIC: + try: + int(self.value) + except ValueError: + return '#,##0.##' + else: + return '#,##0' + + def format(self, cell): + cell.style = self.style() + data_type = self.data_type() + if data_type: + cell.data_type = data_type \ No newline at end of file diff --git a/ppstructure/table/tablepyxl/tablepyxl.py b/ppstructure/table/tablepyxl/tablepyxl.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3cc0fc499fccd93ffe3993a99296bc6603ed8a --- /dev/null +++ b/ppstructure/table/tablepyxl/tablepyxl.py @@ -0,0 +1,118 @@ +# Do imports like python3 so our package works for 2 and 3 +from __future__ import absolute_import + +from lxml import html +from openpyxl import Workbook +from openpyxl.utils import get_column_letter +from premailer import Premailer +from tablepyxl.style import Table + + +def string_to_int(s): + if s.isdigit(): + return int(s) + return 0 + + +def get_Tables(doc): + tree = html.fromstring(doc) + comments = tree.xpath('//comment()') + for comment in comments: + comment.drop_tag() + return [Table(table) for table in tree.xpath('//table')] + + +def write_rows(worksheet, elem, row, column=1): + """ + Writes every tr child element of elem to a row in the worksheet + returns the next row after all rows are written + """ + from openpyxl.cell.cell import MergedCell + + initial_column = column + for table_row in elem.rows: + for table_cell in table_row.cells: + cell = worksheet.cell(row=row, column=column) + while isinstance(cell, MergedCell): + column += 1 + cell = worksheet.cell(row=row, column=column) + + colspan = string_to_int(table_cell.element.get("colspan", "1")) + rowspan = 
string_to_int(table_cell.element.get("rowspan", "1")) + if rowspan > 1 or colspan > 1: + worksheet.merge_cells(start_row=row, start_column=column, + end_row=row + rowspan - 1, end_column=column + colspan - 1) + + cell.value = table_cell.value + table_cell.format(cell) + min_width = table_cell.get_dimension('min-width') + max_width = table_cell.get_dimension('max-width') + + if colspan == 1: + # Initially, when iterating for the first time through the loop, the width of all the cells is None. + # As we start filling in contents, the initial width of the cell (which can be retrieved by: + # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous + # cell in the same column (i.e. width of A2 = width of A1) + width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2) + if max_width and width > max_width: + width = max_width + elif min_width and width < min_width: + width = min_width + worksheet.column_dimensions[get_column_letter(column)].width = width + column += colspan + row += 1 + column = initial_column + return row + + +def table_to_sheet(table, wb): + """ + Takes a table and workbook and writes the table to a new sheet. + The sheet title will be the same as the table attribute name. + """ + ws = wb.create_sheet(title=table.element.get('name')) + insert_table(table, ws, 1, 1) + + +def document_to_workbook(doc, wb=None, base_url=None): + """ + Takes a string representation of an html document and writes one sheet for + every table in the document. + The workbook is returned + """ + if not wb: + wb = Workbook() + wb.remove(wb.active) + + inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform() + tables = get_Tables(inline_styles_doc) + + for table in tables: + table_to_sheet(table, wb) + + return wb + + +def document_to_xl(doc, filename, base_url=None): + """ + Takes a string representation of an html document and writes one sheet for + every table in the document. The workbook is written out to a file called filename + """ + wb = document_to_workbook(doc, base_url=base_url) + wb.save(filename) + + +def insert_table(table, worksheet, column, row): + if table.head: + row = write_rows(worksheet, table.head, row, column) + if table.body: + row = write_rows(worksheet, table.body, row, column) + + +def insert_table_at_cell(table, cell): + """ + Inserts a table at the location of an openpyxl Cell object. + """ + ws = cell.parent + column, row = cell.column, cell.row + insert_table(table, ws, column, row) \ No newline at end of file diff --git a/ppstructure/utility.py b/ppstructure/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..d909f1a8a165745a5c0df78cc3d89960ec4469e7 --- /dev/null +++ b/ppstructure/utility.py @@ -0,0 +1,156 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import random +import ast +from PIL import Image, ImageDraw, ImageFont +import numpy as np +from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args + + +def init_args(): + parser = infer_args() + + # params for output + parser.add_argument("--output", type=str, default='./output') + # params for table structure + parser.add_argument("--table_max_len", type=int, default=488) + parser.add_argument("--table_algorithm", type=str, default='TableAttn') + parser.add_argument("--table_model_dir", type=str) + parser.add_argument( + "--merge_no_span_structure", type=str2bool, default=True) + parser.add_argument( + "--table_char_dict_path", + type=str, + default="../ppocr/utils/dict/table_structure_dict_ch.txt") + # params for layout + parser.add_argument("--layout_model_dir", type=str) + parser.add_argument( + "--layout_dict_path", + type=str, + default="../ppocr/utils/dict/layout_dict/layout_publaynet_dict.txt") + parser.add_argument( + "--layout_score_threshold", + type=float, + default=0.5, + help="Threshold of score.") + parser.add_argument( + "--layout_nms_threshold", + type=float, + default=0.5, + help="Threshold of nms.") + # params for kie + parser.add_argument("--kie_algorithm", type=str, default='LayoutXLM') + parser.add_argument("--ser_model_dir", type=str) + parser.add_argument("--re_model_dir", type=str) + parser.add_argument("--use_visual_backbone", type=str2bool, default=True) + parser.add_argument( + "--ser_dict_path", + type=str, + default="../train_data/XFUND/class_list_xfun.txt") + # need to be None or tb-yx + parser.add_argument("--ocr_order_method", type=str, default=None) + # params for inference + parser.add_argument( + "--mode", + type=str, + choices=['structure', 'kie'], + default='structure', + help='structure and kie is supported') + parser.add_argument( + "--image_orientation", + type=bool, + default=False, + help='Whether to enable image orientation recognition') + parser.add_argument( + "--layout", + type=str2bool, + default=True, + help='Whether to enable layout analysis') + parser.add_argument( + "--table", + type=str2bool, + default=True, + help='In the forward, whether the table area uses table recognition') + parser.add_argument( + "--ocr", + type=str2bool, + default=True, + help='In the forward, whether the non-table area is recognition by ocr') + # param for recovery + parser.add_argument( + "--recovery", + type=str2bool, + default=False, + help='Whether to enable layout of recovery') + parser.add_argument( + "--use_pdf2docx_api", + type=str2bool, + default=False, + help='Whether to use pdf2docx api') + + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def draw_structure_result(image, result, font_path): + if isinstance(image, np.ndarray): + image = Image.fromarray(image) + boxes, txts, scores = [], [], [] + + img_layout = image.copy() + draw_layout = ImageDraw.Draw(img_layout) + text_color = (255, 255, 255) + text_background_color = (80, 127, 255) + catid2color = {} + font_size = 15 + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + + for region in result: + if region['type'] not in catid2color: + box_color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + catid2color[region['type']] = box_color + else: + box_color = catid2color[region['type']] + box_layout = region['bbox'] + draw_layout.rectangle( + [(box_layout[0], box_layout[1]), (box_layout[2], box_layout[3])], + outline=box_color, + width=3) + text_w, text_h = 
font.getsize(region['type']) + draw_layout.rectangle( + [(box_layout[0], box_layout[1]), + (box_layout[0] + text_w, box_layout[1] + text_h)], + fill=text_background_color) + draw_layout.text( + (box_layout[0], box_layout[1]), + region['type'], + fill=text_color, + font=font) + + if region['type'] == 'table': + pass + else: + for text_result in region['res']: + boxes.append(np.array(text_result['text_region'])) + txts.append(text_result['text']) + scores.append(text_result['confidence']) + + im_show = draw_ocr_box_txt( + img_layout, boxes, txts, scores, font_path=font_path, drop_score=0) + return im_show diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..959b42a2def341414f1c57b4908ba7323b7f0803 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +paddlepaddle==2.4.2 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/noavx/stable.html --no-index +paddleocr>=2.6 +Pillow +Gradio +Polygon3 -i https://pypi.tuna.tsinghua.edu.cn/simple +lanms-neo==1.0.2 -i https://pypi.tuna.tsinghua.edu.cn/simple \ No newline at end of file diff --git a/simfang.ttf b/simfang.ttf new file mode 100644 index 0000000000000000000000000000000000000000..b1905868c15c38c3aa3161a3b1e17447a38fe201 --- /dev/null +++ b/simfang.ttf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521c6f7546b4eb64fa4b0cd604bbd36333a20a57e388c8e2ad2ad07b9e593864 +size 10576012 diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d56c9dbaa1304b160521da03c05db2352e341bf2 --- /dev/null +++ b/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/__pycache__/__init__.cpython-37.pyc b/tools/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..645a99a8ec2fa4133ebf1c91986de98369a15fa7 Binary files /dev/null and b/tools/__pycache__/__init__.cpython-37.pyc differ diff --git a/tools/__pycache__/__init__.cpython-38.pyc b/tools/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16755a5ecf7f6a7054fbaabe36ff84df7ec0ae02 Binary files /dev/null and b/tools/__pycache__/__init__.cpython-38.pyc differ diff --git a/tools/end2end/convert_ppocr_label.py b/tools/end2end/convert_ppocr_label.py new file mode 100644 index 0000000000000000000000000000000000000000..c64b9ed168113879182262b609bea692fcb73165 --- /dev/null +++ b/tools/end2end/convert_ppocr_label.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import json +import os + + +def poly_to_string(poly): + if len(poly.shape) > 1: + poly = np.array(poly).flatten() + + string = "\t".join(str(i) for i in poly) + return string + + +def convert_label(label_dir, mode="gt", save_dir="./save_results/"): + if not os.path.exists(label_dir): + raise ValueError(f"The file {label_dir} does not exist!") + + assert label_dir != save_dir, "label_dir and save_dir must be different" + + label_file = open(label_dir, 'r') + data = label_file.readlines() + + gt_dict = {} + + for line in data: + try: + tmp = line.split('\t') + assert len(tmp) == 2, "expected a single tab between image path and label" + except: + tmp = line.strip().split(' ') + + gt_lists = [] + + if tmp[0].split('/')[0] is not None: + img_path = tmp[0] + anno = json.loads(tmp[1]) + gt_collect = [] + for dic in anno: + #txt = dic['transcription'].replace(' ', '') # ignore blank + txt = dic['transcription'] + if 'score' in dic and float(dic['score']) < 0.5: + continue + if u'\u3000' in txt: txt = txt.replace(u'\u3000', u' ') + #while ' ' in txt: + # txt = txt.replace(' ', '') + poly = np.array(dic['points']).flatten() + if txt == "###": + txt_tag = 1 ## ignore 1 + else: + txt_tag = 0 + if mode == "gt": + gt_label = poly_to_string(poly) + "\t" + str( + txt_tag) + "\t" + txt + "\n" + else: + gt_label = poly_to_string(poly) + "\t" + txt + "\n" + + gt_lists.append(gt_label) + + gt_dict[img_path] = gt_lists + else: + continue + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + for img_name in gt_dict.keys(): + save_name = img_name.split("/")[-1] + save_file = os.path.join(save_dir, save_name + ".txt") + with open(save_file, "w") as f: + f.writelines(gt_dict[img_name]) + + print("The converted labels are saved in {}".format(save_dir)) + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description="args") + parser.add_argument("--label_path", type=str, required=True) + parser.add_argument("--save_folder", type=str, required=True) + parser.add_argument("--mode", type=str, default="gt") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + convert_label(args.label_path, args.mode, args.save_folder) diff --git a/tools/end2end/draw_html.py b/tools/end2end/draw_html.py new file mode 100644 index 0000000000000000000000000000000000000000..fcac8ad3bfb6f0d1fcc48ea026b0febe60a001c0 --- /dev/null +++ b/tools/end2end/draw_html.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
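For reference, convert_label above writes one text file per image, where each line is the flattened 4-point polygon joined by tabs, followed (in gt mode) by an ignore tag and the transcription. The snippet below is only an illustrative parser for that converted format, not part of the diff; the sample line is made up.

```python
# Minimal sketch (illustration only): parse one line produced by convert_label(mode="gt").
# Line format: x1 y1 x2 y2 x3 y3 x4 y4 <tab> ignore_tag <tab> transcription
def parse_converted_gt_line(line):
    parts = line.rstrip("\n").split("\t")
    coords = [float(v) for v in parts[:8]]   # flattened 4-corner polygon
    ignore_tag = int(parts[8])               # 1 when the transcription was "###"
    text = parts[9] if len(parts) > 9 else ""
    return coords, ignore_tag, text

sample = "8.0\t48.0\t157.0\t44.0\t159.0\t115.0\t10.0\t119.0\t0\thello\n"
print(parse_converted_gt_line(sample))
```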
+ +import os +import argparse + + +def str2bool(v): + return v.lower() in ("true", "t", "1") + + +def init_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--image_dir", type=str, default="") + parser.add_argument("--save_html_path", type=str, default="./default.html") + parser.add_argument("--width", type=int, default=640) + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def draw_debug_img(args): + + html_path = args.save_html_path + + err_cnt = 0 + with open(html_path, 'w') as html: + html.write('\n\n') + html.write('\n') + html.write( + "" + ) + image_list = [] + path = args.image_dir + for i, filename in enumerate(sorted(os.listdir(path))): + if filename.endswith("txt"): continue + # The image path + base = "{}/{}".format(path, filename) + html.write("\n") + html.write(f'') + + html.write("\n") + html.write('\n') + html.write('
{filename}\n GT') + html.write(f'GT\n
\n') + html.write('\n\n') + print(f"The html file saved in {html_path}") + return + + +if __name__ == "__main__": + + args = parse_args() + + draw_debug_img(args) diff --git a/tools/end2end/eval_end2end.py b/tools/end2end/eval_end2end.py new file mode 100644 index 0000000000000000000000000000000000000000..dd37940845b60d55291b40b8cd50d6afd5b91f94 --- /dev/null +++ b/tools/end2end/eval_end2end.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys +import shapely +from shapely.geometry import Polygon +import numpy as np +from collections import defaultdict +import operator +import editdistance + + +def strQ2B(ustring): + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 12288: + inside_code = 32 + elif (inside_code >= 65281 and inside_code <= 65374): + inside_code -= 65248 + rstring += chr(inside_code) + return rstring + + +def polygon_from_str(polygon_points): + """ + Create a shapely polygon object from gt or dt line. + """ + polygon_points = np.array(polygon_points).reshape(4, 2) + polygon = Polygon(polygon_points).convex_hull + return polygon + + +def polygon_iou(poly1, poly2): + """ + Intersection over union between two shapely polygons. 
+ """ + if not poly1.intersects( + poly2): # this test is fast and can accelerate calculation + iou = 0 + else: + try: + inter_area = poly1.intersection(poly2).area + union_area = poly1.area + poly2.area - inter_area + iou = float(inter_area) / union_area + except shapely.geos.TopologicalError: + # except Exception as e: + # print(e) + print('shapely.geos.TopologicalError occurred, iou set to 0') + iou = 0 + return iou + + +def ed(str1, str2): + return editdistance.eval(str1, str2) + + +def e2e_eval(gt_dir, res_dir, ignore_blank=False): + print('start testing...') + iou_thresh = 0.5 + val_names = os.listdir(gt_dir) + num_gt_chars = 0 + gt_count = 0 + dt_count = 0 + hit = 0 + ed_sum = 0 + + for i, val_name in enumerate(val_names): + with open(os.path.join(gt_dir, val_name), encoding='utf-8') as f: + gt_lines = [o.strip() for o in f.readlines()] + gts = [] + ignore_masks = [] + for line in gt_lines: + parts = line.strip().split('\t') + # ignore illegal data + if len(parts) < 9: + continue + assert (len(parts) < 11) + if len(parts) == 9: + gts.append(parts[:8] + ['']) + else: + gts.append(parts[:8] + [parts[-1]]) + + ignore_masks.append(parts[8]) + + val_path = os.path.join(res_dir, val_name) + if not os.path.exists(val_path): + dt_lines = [] + else: + with open(val_path, encoding='utf-8') as f: + dt_lines = [o.strip() for o in f.readlines()] + dts = [] + for line in dt_lines: + # print(line) + parts = line.strip().split("\t") + assert (len(parts) < 10), "line error: {}".format(line) + if len(parts) == 8: + dts.append(parts + ['']) + else: + dts.append(parts) + + dt_match = [False] * len(dts) + gt_match = [False] * len(gts) + all_ious = defaultdict(tuple) + for index_gt, gt in enumerate(gts): + gt_coors = [float(gt_coor) for gt_coor in gt[0:8]] + gt_poly = polygon_from_str(gt_coors) + for index_dt, dt in enumerate(dts): + dt_coors = [float(dt_coor) for dt_coor in dt[0:8]] + dt_poly = polygon_from_str(dt_coors) + iou = polygon_iou(dt_poly, gt_poly) + if iou >= iou_thresh: + all_ious[(index_gt, index_dt)] = iou + sorted_ious = sorted( + all_ious.items(), key=operator.itemgetter(1), reverse=True) + sorted_gt_dt_pairs = [item[0] for item in sorted_ious] + + # matched gt and dt + for gt_dt_pair in sorted_gt_dt_pairs: + index_gt, index_dt = gt_dt_pair + if gt_match[index_gt] == False and dt_match[index_dt] == False: + gt_match[index_gt] = True + dt_match[index_dt] = True + if ignore_blank: + gt_str = strQ2B(gts[index_gt][8]).replace(" ", "") + dt_str = strQ2B(dts[index_dt][8]).replace(" ", "") + else: + gt_str = strQ2B(gts[index_gt][8]) + dt_str = strQ2B(dts[index_dt][8]) + if ignore_masks[index_gt] == '0': + ed_sum += ed(gt_str, dt_str) + num_gt_chars += len(gt_str) + if gt_str == dt_str: + hit += 1 + gt_count += 1 + dt_count += 1 + + # unmatched dt + for tindex, dt_match_flag in enumerate(dt_match): + if dt_match_flag == False: + dt_str = dts[tindex][8] + gt_str = '' + ed_sum += ed(dt_str, gt_str) + dt_count += 1 + + # unmatched gt + for tindex, gt_match_flag in enumerate(gt_match): + if gt_match_flag == False and ignore_masks[tindex] == '0': + dt_str = '' + gt_str = gts[tindex][8] + ed_sum += ed(gt_str, dt_str) + num_gt_chars += len(gt_str) + gt_count += 1 + + eps = 1e-9 + print('hit, dt_count, gt_count', hit, dt_count, gt_count) + precision = hit / (dt_count + eps) + recall = hit / (gt_count + eps) + fmeasure = 2.0 * precision * recall / (precision + recall + eps) + avg_edit_dist_img = ed_sum / len(val_names) + avg_edit_dist_field = ed_sum / (gt_count + eps) + character_acc = 1 - ed_sum / 
(num_gt_chars + eps) + + print('character_acc: %.2f' % (character_acc * 100) + "%") + print('avg_edit_dist_field: %.2f' % (avg_edit_dist_field)) + print('avg_edit_dist_img: %.2f' % (avg_edit_dist_img)) + print('precision: %.2f' % (precision * 100) + "%") + print('recall: %.2f' % (recall * 100) + "%") + print('fmeasure: %.2f' % (fmeasure * 100) + "%") + + +if __name__ == '__main__': + # if len(sys.argv) != 3: + # print("python3 ocr_e2e_eval.py gt_dir res_dir") + # exit(-1) + # gt_folder = sys.argv[1] + # pred_folder = sys.argv[2] + gt_folder = sys.argv[1] + pred_folder = sys.argv[2] + e2e_eval(gt_folder, pred_folder) diff --git a/tools/end2end/readme.md b/tools/end2end/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..636ee764aef35a551f729cce20c16357ddf4f495 --- /dev/null +++ b/tools/end2end/readme.md @@ -0,0 +1,63 @@ + +# Introduction + +The `tools/end2end` directory contains the metric-evaluation code and visualization tools for the chained text detection + text recognition pipeline. This section describes how to run the end-to-end evaluation of text detection + text recognition. + + +## End-to-end evaluation steps + +**Step 1:** + +Run `tools/infer/predict_system.py` to obtain the saved results: + +``` +python3 tools/infer/predict_system.py --det_model_dir=./ch_PP-OCRv2_det_infer/ --rec_model_dir=./ch_PP-OCRv2_rec_infer/ --image_dir=./datasets/img_dir/ --draw_img_save_dir=./ch_PP-OCRv2_results/ --is_visualize=True +``` + +The detection/recognition visualizations are saved to `./ch_PP-OCRv2_results/` by default, and the predictions are saved to `./ch_PP-OCRv2_results/system_results.txt` in the following format: +``` +all-sum-510/00224225.jpg [{"transcription": "超赞", "points": [[8.0, 48.0], [157.0, 44.0], [159.0, 115.0], [10.0, 119.0]], "score": "0.99396634"}, {"transcription": "中", "points": [[202.0, 152.0], [230.0, 152.0], [230.0, 163.0], [202.0, 163.0]], "score": "0.09310734"}, {"transcription": "58.0m", "points": [[196.0, 192.0], [444.0, 192.0], [444.0, 240.0], [196.0, 240.0]], "score": "0.44041982"}, {"transcription": "汽配", "points": [[55.0, 263.0], [95.0, 263.0], [95.0, 281.0], [55.0, 281.0]], "score": "0.9986651"}, {"transcription": "成总店", "points": [[120.0, 262.0], [176.0, 262.0], [176.0, 283.0], [120.0, 283.0]], "score": "0.9929402"}, {"transcription": "K", "points": [[237.0, 286.0], [311.0, 286.0], [311.0, 345.0], [237.0, 345.0]], "score": "0.6074794"}, {"transcription": "88:-8", "points": [[203.0, 405.0], [477.0, 414.0], [475.0, 459.0], [201.0, 450.0]], "score": "0.7106863"}] +``` + + +**Step 2:** + +Convert the data saved in Step 1 into the format required by the end-to-end evaluation: + +Edit the code in `tools/end2end/convert_ppocr_label.py`: in the convert_label function, set the input label path, the mode, the save path, and so on, then convert both the ground-truth labels and the predicted labels. + +``` +python3 tools/end2end/convert_ppocr_label.py --mode=gt --label_path=path/to/label_txt --save_folder=save_gt_label + +python3 tools/end2end/convert_ppocr_label.py --mode=pred --label_path=path/to/pred_txt --save_folder=save_PPOCRV2_infer +``` + +This produces the following directories: +``` +├── ./save_gt_label/ +├── ./save_PPOCRV2_infer/ +``` + +**Step 3:** + +Run `tools/eval_end2end.py` to compute the end-to-end metrics, as follows: + +``` +python3 tools/eval_end2end.py "gt_label_dir" "predict_label_dir" +``` + +For example: + +``` +python3 tools/eval_end2end.py ./save_gt_label/ ./save_PPOCRV2_infer/ +``` +This prints results like the following, where fmeasure is the main metric of interest: +``` +hit, dt_count, gt_count 1557 2693 3283 +character_acc: 61.77% +avg_edit_dist_field: 3.08 +avg_edit_dist_img: 51.82 +precision: 57.82% +recall: 47.43% +fmeasure: 52.11% +``` diff --git a/tools/eval.py b/tools/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..21f4d94d5e4ed560b8775c8827ffdbbd00355218 --- /dev/null +++ b/tools/eval.py @@ -0,0 +1,137 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, __dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +import paddle +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.metrics import build_metric +from ppocr.utils.save_load import load_model +import tools.program as program + + +def main(): + global_config = config['Global'] + # build dataloader + valid_dataloader = build_dataloader(config, 'Eval', device, logger) + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + # for rec algorithm + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + if config['Architecture']["algorithm"] in ["Distillation", + ]: # distillation model + for key in config['Architecture']["Models"]: + if config['Architecture']['Models'][key]['Head'][ + 'name'] == 'MultiHead': # for multi head + out_channels_list = {} + if config['PostProcess'][ + 'name'] == 'DistillationSARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Models'][key]['Head'][ + 'out_channels_list'] = out_channels_list + else: + config['Architecture']["Models"][key]["Head"][ + 'out_channels'] = char_num + elif config['Architecture']['Head'][ + 'name'] == 'MultiHead': # for multi head + out_channels_list = {} + if config['PostProcess']['name'] == 'SARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Head'][ + 'out_channels_list'] = out_channels_list + else: # base rec model + config['Architecture']["Head"]['out_channels'] = char_num + + model = build_model(config['Architecture']) + extra_input_models = [ + "SRN", "NRTR", "SAR", "SEED", "SVTR", "VisionLAN", "RobustScanner" + ] + extra_input = False + if config['Architecture']['algorithm'] == 'Distillation': + for key in config['Architecture']["Models"]: + extra_input = extra_input or config['Architecture']['Models'][key][ + 'algorithm'] in extra_input_models + else: + extra_input = config['Architecture']['algorithm'] in extra_input_models + if "model_type" in config['Architecture'].keys(): + if config['Architecture']['algorithm'] == 'CAN': + model_type = 'can' + else: + model_type = config['Architecture']['model_type'] + else: + model_type = None + + # build metric + eval_class = build_metric(config['Metric']) + # amp + use_amp = config["Global"].get("use_amp", False) + amp_level = config["Global"].get("amp_level", 'O2') + amp_custom_black_list = config['Global'].get('amp_custom_black_list', []) + if use_amp: + 
AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + scale_loss = config["Global"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["Global"].get( + "use_dynamic_loss_scaling", False) + scaler = paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + if amp_level == "O2": + model = paddle.amp.decorate( + models=model, level=amp_level, master_weight=True) + else: + scaler = None + + best_model_dict = load_model( + config, model, model_type=config['Architecture']["model_type"]) + if len(best_model_dict): + logger.info('metric in ckpt ***************') + for k, v in best_model_dict.items(): + logger.info('{}:{}'.format(k, v)) + + # start eval + metric = program.eval(model, valid_dataloader, post_process_class, + eval_class, model_type, extra_input, scaler, + amp_level, amp_custom_black_list) + logger.info('metric eval ***************') + for k, v in metric.items(): + logger.info('{}:{}'.format(k, v)) + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/export_center.py b/tools/export_center.py new file mode 100644 index 0000000000000000000000000000000000000000..30b9c33499b8d0c8044682c6a078e00f683c1d7c --- /dev/null +++ b/tools/export_center.py @@ -0,0 +1,77 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
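Both tools/eval.py above and tools/export_model.py later in this diff repeat the same bookkeeping for MultiHead recognition models: the character-dictionary size drives the output channels of the CTC and SAR branches, with two SAR-specific tokens subtracted and then re-added. The snippet below only restates that arithmetic as an illustration; it is not code from the diff.

```python
# Minimal sketch (illustration only) of the MultiHead out_channels_list logic above.
# SARLabelDecode introduces two extra symbols, so the dictionary size is reduced by 2
# for the CTC branch and restored (+2) for the SAR branch.
def multihead_out_channels(char_num, postprocess_name):
    if postprocess_name in ("SARLabelDecode", "DistillationSARLabelDecode"):
        char_num -= 2
    return {"CTCLabelDecode": char_num, "SARLabelDecode": char_num + 2}

# e.g. a 6625-entry dictionary with SAR post-processing:
print(multihead_out_channels(6625, "SARLabelDecode"))
# {'CTCLabelDecode': 6623, 'SARLabelDecode': 6625}
```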
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import pickle + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) + +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import print_dict +import tools.program as program + + +def main(): + global_config = config['Global'] + # build dataloader + config['Eval']['dataset']['name'] = config['Train']['dataset']['name'] + config['Eval']['dataset']['data_dir'] = config['Train']['dataset'][ + 'data_dir'] + config['Eval']['dataset']['label_file_list'] = config['Train']['dataset'][ + 'label_file_list'] + eval_dataloader = build_dataloader(config, 'Eval', device, logger) + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + # for rec algorithm + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + config['Architecture']["Head"]['out_channels'] = char_num + + #set return_features = True + config['Architecture']["Head"]["return_feats"] = True + + model = build_model(config['Architecture']) + + best_model_dict = load_model(config, model) + if len(best_model_dict): + logger.info('metric in ckpt ***************') + for k, v in best_model_dict.items(): + logger.info('{}:{}'.format(k, v)) + + # get features from train data + char_center = program.get_center(model, eval_dataloader, post_process_class) + + #serialize to disk + with open("train_center.pkl", 'wb') as f: + pickle.dump(char_center, f) + return + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/export_model.py b/tools/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4b90fcae435619a53a3def8cc4dc46b4e2963bff --- /dev/null +++ b/tools/export_model.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
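tools/export_center.py above serializes the per-character center features returned by program.get_center into train_center.pkl. A tiny read-back sketch follows; the exact layout of the pickled dict (character key mapped to an averaged feature vector) is an assumption drawn from how it is produced, so treat the snippet as illustrative only.

```python
# Minimal sketch (assumed usage): inspect the character centers dumped by export_center.py.
# The value type/shape is an assumption; check a real train_center.pkl before relying on it.
import pickle

with open("train_center.pkl", "rb") as f:
    char_center = pickle.load(f)

print("characters with center features:", len(char_center))
key, center = next(iter(char_center.items()))
print(key, getattr(center, "shape", type(center)))
```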
+ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, ".."))) + +import argparse + +import paddle +from paddle.jit import to_static + +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.logging import get_logger +from tools.program import load_config, merge_config, ArgsParser + + +def export_single_model(model, + arch_config, + save_path, + logger, + input_shape=None, + quanter=None): + if arch_config["algorithm"] == "SRN": + max_text_length = arch_config["Head"]["max_text_length"] + other_shape = [ + paddle.static.InputSpec( + shape=[None, 1, 64, 256], dtype="float32"), [ + paddle.static.InputSpec( + shape=[None, 256, 1], + dtype="int64"), paddle.static.InputSpec( + shape=[None, max_text_length, 1], dtype="int64"), + paddle.static.InputSpec( + shape=[None, 8, max_text_length, max_text_length], + dtype="int64"), paddle.static.InputSpec( + shape=[None, 8, max_text_length, max_text_length], + dtype="int64") + ] + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "SAR": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, 160], dtype="float32"), + [paddle.static.InputSpec( + shape=[None], dtype="float32")] + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "SVTR": + if arch_config["Head"]["name"] == 'MultiHead': + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, -1], dtype="float32"), + ] + else: + other_shape = [ + paddle.static.InputSpec( + shape=[None] + input_shape, dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "PREN": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 64, 256], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["model_type"] == "sr": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 16, 64], dtype="float32") + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "ViTSTR": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 1, 224, 224], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "ABINet": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 32, 128], dtype="float32"), + ] + # print([None, 3, 32, 128]) + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["NRTR", "SPIN", 'RFL']: + other_shape = [ + paddle.static.InputSpec( + shape=[None, 1, 32, 100], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "VisionLAN": + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 64, 256], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "RobustScanner": + max_text_length = arch_config["Head"]["max_text_length"] + other_shape = [ + paddle.static.InputSpec( + shape=[None, 3, 48, 160], dtype="float32"), [ + paddle.static.InputSpec( + shape=[None, ], dtype="float32"), + paddle.static.InputSpec( + shape=[None, max_text_length], dtype="int64") + ] + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "CAN": + other_shape = [[ + paddle.static.InputSpec( + shape=[None, 1, None, None], + dtype="float32"), paddle.static.InputSpec( + 
shape=[None, 1, None, None], dtype="float32"), + paddle.static.InputSpec( + shape=[None, arch_config['Head']['max_text_length']], + dtype="int64") + ]] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: + input_spec = [ + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, 512, 4], dtype="int64"), # bbox + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # attention_mask + paddle.static.InputSpec( + shape=[None, 512], dtype="int64"), # token_type_ids + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype="int64"), # image + ] + if 'Re' in arch_config['Backbone']['name']: + input_spec.extend([ + paddle.static.InputSpec( + shape=[None, 512, 3], dtype="int64"), # entities + paddle.static.InputSpec( + shape=[None, None, 2], dtype="int64"), # relations + ]) + if model.backbone.use_visual_backbone is False: + input_spec.pop(4) + model = to_static(model, input_spec=[input_spec]) + else: + infer_shape = [3, -1, -1] + if arch_config["model_type"] == "rec": + infer_shape = [3, 32, -1] # for rec model, H must be 32 + if "Transform" in arch_config and arch_config[ + "Transform"] is not None and arch_config["Transform"][ + "name"] == "TPS": + logger.info( + "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training" + ) + infer_shape[-1] = 100 + elif arch_config["model_type"] == "table": + infer_shape = [3, 488, 488] + if arch_config["algorithm"] == "TableMaster": + infer_shape = [3, 480, 480] + if arch_config["algorithm"] == "SLANet": + infer_shape = [3, -1, -1] + model = to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + infer_shape, dtype="float32") + ]) + + if quanter is None: + paddle.jit.save(model, save_path) + else: + quanter.save_quantized_model(model, save_path) + logger.info("inference model is saved to {}".format(save_path)) + return + + +def main(): + FLAGS = ArgsParser().parse_args() + config = load_config(FLAGS.config) + config = merge_config(config, FLAGS.opt) + logger = get_logger() + # build post process + + post_process_class = build_post_process(config["PostProcess"], + config["Global"]) + + # build model + # for rec algorithm + if hasattr(post_process_class, "character"): + char_num = len(getattr(post_process_class, "character")) + if config["Architecture"]["algorithm"] in ["Distillation", + ]: # distillation model + for key in config["Architecture"]["Models"]: + if config["Architecture"]["Models"][key]["Head"][ + "name"] == 'MultiHead': # multi head + out_channels_list = {} + if config['PostProcess'][ + 'name'] == 'DistillationSARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Models'][key]['Head'][ + 'out_channels_list'] = out_channels_list + else: + config["Architecture"]["Models"][key]["Head"][ + "out_channels"] = char_num + # just one final tensor needs to exported for inference + config["Architecture"]["Models"][key][ + "return_all_feats"] = False + elif config['Architecture']['Head'][ + 'name'] == 'MultiHead': # multi head + out_channels_list = {} + char_num = len(getattr(post_process_class, 'character')) + if config['PostProcess']['name'] == 'SARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + 
config['Architecture']['Head'][ + 'out_channels_list'] = out_channels_list + else: # base rec model + config["Architecture"]["Head"]["out_channels"] = char_num + + # for sr algorithm + if config["Architecture"]["model_type"] == "sr": + config['Architecture']["Transform"]['infer_mode'] = True + model = build_model(config["Architecture"]) + load_model(config, model, model_type=config['Architecture']["model_type"]) + model.eval() + + save_path = config["Global"]["save_inference_dir"] + + arch_config = config["Architecture"] + + if arch_config["algorithm"] == "SVTR" and arch_config["Head"][ + "name"] != 'MultiHead': + input_shape = config["Eval"]["dataset"]["transforms"][-2][ + 'SVTRRecResizeImg']['image_shape'] + else: + input_shape = None + + if arch_config["algorithm"] in ["Distillation", ]: # distillation model + archs = list(arch_config["Models"].values()) + for idx, name in enumerate(model.model_name_list): + sub_model_save_path = os.path.join(save_path, name, "inference") + export_single_model(model.model_list[idx], archs[idx], + sub_model_save_path, logger) + else: + save_path = os.path.join(save_path, "inference") + export_single_model( + model, arch_config, save_path, logger, input_shape=input_shape) + + +if __name__ == "__main__": + main() diff --git a/tools/infer/__pycache__/predict_cls.cpython-37.pyc b/tools/infer/__pycache__/predict_cls.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e1f97a23bbd31a19ac1605e5dfd201c42d4c3f2 Binary files /dev/null and b/tools/infer/__pycache__/predict_cls.cpython-37.pyc differ diff --git a/tools/infer/__pycache__/predict_cls.cpython-38.pyc b/tools/infer/__pycache__/predict_cls.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..368a4d392f5a5945a944953185084348b25364b8 Binary files /dev/null and b/tools/infer/__pycache__/predict_cls.cpython-38.pyc differ diff --git a/tools/infer/__pycache__/predict_det.cpython-37.pyc b/tools/infer/__pycache__/predict_det.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a51504410fc09c647ddeb9fb7660c2098c6a302 Binary files /dev/null and b/tools/infer/__pycache__/predict_det.cpython-37.pyc differ diff --git a/tools/infer/__pycache__/predict_det.cpython-38.pyc b/tools/infer/__pycache__/predict_det.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d243b7f2f27763d15c73c189fad4d2ea4a1151cb Binary files /dev/null and b/tools/infer/__pycache__/predict_det.cpython-38.pyc differ diff --git a/tools/infer/__pycache__/predict_rec.cpython-37.pyc b/tools/infer/__pycache__/predict_rec.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f454f9c994b1604534fd6ab340503bdecc01d0f1 Binary files /dev/null and b/tools/infer/__pycache__/predict_rec.cpython-37.pyc differ diff --git a/tools/infer/__pycache__/predict_rec.cpython-38.pyc b/tools/infer/__pycache__/predict_rec.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a4a365e1304035a61e0ee2085a12c3743990955 Binary files /dev/null and b/tools/infer/__pycache__/predict_rec.cpython-38.pyc differ diff --git a/tools/infer/__pycache__/predict_system.cpython-37.pyc b/tools/infer/__pycache__/predict_system.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..857115b209e404067f5bc14c88c6088bd4b26289 Binary files /dev/null and b/tools/infer/__pycache__/predict_system.cpython-37.pyc differ diff --git a/tools/infer/__pycache__/predict_system.cpython-38.pyc 
b/tools/infer/__pycache__/predict_system.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2682b8dbc606f2068c3ce6647047b32614b3554e Binary files /dev/null and b/tools/infer/__pycache__/predict_system.cpython-38.pyc differ diff --git a/tools/infer/__pycache__/utility.cpython-37.pyc b/tools/infer/__pycache__/utility.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56bfe8902d9d6c4a28b130be83fe6cf53cab9d44 Binary files /dev/null and b/tools/infer/__pycache__/utility.cpython-37.pyc differ diff --git a/tools/infer/__pycache__/utility.cpython-38.pyc b/tools/infer/__pycache__/utility.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f510691c52a4a6fb5ee613a9e3af86b8b4dfd05 Binary files /dev/null and b/tools/infer/__pycache__/utility.cpython-38.pyc differ diff --git a/tools/infer/predict_cls.py b/tools/infer/predict_cls.py new file mode 100644 index 0000000000000000000000000000000000000000..d2b7108ca35666acfa53e785686fd7b9dfc21ed5 --- /dev/null +++ b/tools/infer/predict_cls.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import copy +import numpy as np +import math +import time +import traceback + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read + +logger = get_logger() + + +class TextClassifier(object): + def __init__(self, args): + self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")] + self.cls_batch_num = args.cls_batch_num + self.cls_thresh = args.cls_thresh + postprocess_params = { + 'name': 'ClsPostProcess', + "label_list": args.label_list, + } + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, _ = \ + utility.create_predictor(args, 'cls', logger) + self.use_onnx = args.use_onnx + + def resize_norm_img(self, img): + imgC, imgH, imgW = self.cls_image_shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if self.cls_image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def __call__(self, img_list): + img_list = 
copy.deepcopy(img_list) + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the cls process + indices = np.argsort(np.array(width_list)) + + cls_res = [['', 0.0]] * img_num + batch_num = self.cls_batch_num + elapse = 0 + for beg_img_no in range(0, img_num, batch_num): + + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + max_wh_ratio = 0 + starttime = time.time() + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[indices[ino]]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, input_dict) + prob_out = outputs[0] + else: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + prob_out = self.output_tensors[0].copy_to_cpu() + self.predictor.try_shrink_memory() + cls_result = self.postprocess_op(prob_out) + elapse += time.time() - starttime + for rno in range(len(cls_result)): + label, score = cls_result[rno] + cls_res[indices[beg_img_no + rno]] = [label, score] + if '180' in label and score > self.cls_thresh: + img_list[indices[beg_img_no + rno]] = cv2.rotate( + img_list[indices[beg_img_no + rno]], 1) + return img_list, cls_res, elapse + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_classifier = TextClassifier(args) + valid_image_file_list = [] + img_list = [] + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + img_list, cls_res, predict_time = text_classifier(img_list) + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + for ino in range(len(img_list)): + logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], + cls_res[ino])) + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/tools/infer/predict_det.py b/tools/infer/predict_det.py new file mode 100644 index 0000000000000000000000000000000000000000..1b4446a6717bccdc5b3de4ba70e058885479be84 --- /dev/null +++ b/tools/infer/predict_det.py @@ -0,0 +1,353 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
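One detail of TextClassifier.__call__ above that is easy to miss: crops are sorted by width/height ratio before batching, so images in the same batch need similar padding, and the classification results are then scattered back through the sort indices. The standalone snippet below only illustrates that pattern with dummy crops; it is not taken from the diff.

```python
# Minimal sketch (illustration only) of the aspect-ratio batching trick used above.
import numpy as np

crops = [np.zeros((48, w, 3), dtype=np.uint8) for w in (320, 48, 120)]
ratios = [c.shape[1] / float(c.shape[0]) for c in crops]
indices = np.argsort(np.array(ratios))      # process narrow crops first

results = [None] * len(crops)
for rank, idx in enumerate(indices):
    results[idx] = f"crop {idx} classified in batch position {rank}"
print(results)
```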
+import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time +import sys + +import tools.infer.utility as utility +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process +import json +logger = get_logger() + + +class TextDetector(object): + def __init__(self, args): + self.args = args + self.det_algorithm = args.det_algorithm + self.use_onnx = args.use_onnx + pre_process_list = [{ + 'DetResizeForTest': { + 'limit_side_len': args.det_limit_side_len, + 'limit_type': args.det_limit_type, + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {} + if self.det_algorithm == "DB": + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = args.det_db_thresh + postprocess_params["box_thresh"] = args.det_db_box_thresh + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio + postprocess_params["use_dilation"] = args.use_dilation + postprocess_params["score_mode"] = args.det_db_score_mode + postprocess_params["box_type"] = args.det_box_type + elif self.det_algorithm == "DB++": + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = args.det_db_thresh + postprocess_params["box_thresh"] = args.det_db_box_thresh + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio + postprocess_params["use_dilation"] = args.use_dilation + postprocess_params["score_mode"] = args.det_db_score_mode + postprocess_params["box_type"] = args.det_box_type + pre_process_list[1] = { + 'NormalizeImage': { + 'std': [1.0, 1.0, 1.0], + 'mean': + [0.48109378172549, 0.45752457890196, 0.40787054090196], + 'scale': '1./255.', + 'order': 'hwc' + } + } + elif self.det_algorithm == "EAST": + postprocess_params['name'] = 'EASTPostProcess' + postprocess_params["score_thresh"] = args.det_east_score_thresh + postprocess_params["cover_thresh"] = args.det_east_cover_thresh + postprocess_params["nms_thresh"] = args.det_east_nms_thresh + elif self.det_algorithm == "SAST": + pre_process_list[0] = { + 'DetResizeForTest': { + 'resize_long': args.det_limit_side_len + } + } + postprocess_params['name'] = 'SASTPostProcess' + postprocess_params["score_thresh"] = args.det_sast_score_thresh + postprocess_params["nms_thresh"] = args.det_sast_nms_thresh + + if args.det_box_type == 'poly': + postprocess_params["sample_pts_num"] = 6 + postprocess_params["expand_scale"] = 1.2 + postprocess_params["shrink_ratio_of_width"] = 0.2 + else: + postprocess_params["sample_pts_num"] = 2 + postprocess_params["expand_scale"] = 1.0 + postprocess_params["shrink_ratio_of_width"] = 0.3 + + elif self.det_algorithm == "PSE": + postprocess_params['name'] = 'PSEPostProcess' + postprocess_params["thresh"] = args.det_pse_thresh + postprocess_params["box_thresh"] = args.det_pse_box_thresh + postprocess_params["min_area"] = args.det_pse_min_area + postprocess_params["box_type"] = args.det_box_type + postprocess_params["scale"] = args.det_pse_scale + elif 
self.det_algorithm == "FCE": + pre_process_list[0] = { + 'DetResizeForTest': { + 'rescale_img': [1080, 736] + } + } + postprocess_params['name'] = 'FCEPostProcess' + postprocess_params["scales"] = args.scales + postprocess_params["alpha"] = args.alpha + postprocess_params["beta"] = args.beta + postprocess_params["fourier_degree"] = args.fourier_degree + postprocess_params["box_type"] = args.det_box_type + elif self.det_algorithm == "CT": + pre_process_list[0] = {'ScaleAlignedShort': {'short_size': 640}} + postprocess_params['name'] = 'CTPostProcess' + else: + logger.info("unknown det_algorithm:{}".format(self.det_algorithm)) + sys.exit(0) + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = utility.create_predictor( + args, 'det', logger) + + if self.use_onnx: + img_h, img_w = self.input_tensor.shape[2:] + if img_h is not None and img_w is not None and img_h > 0 and img_w > 0: + pre_process_list[0] = { + 'DetResizeForTest': { + 'image_shape': [img_h, img_w] + } + } + self.preprocess_op = create_operators(pre_process_list) + + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="det", + model_precision=args.precision, + batch_size=1, + data_shape="dynamic", + save_path=None, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=2, + logger=logger) + + def order_points_clockwise(self, pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0) + diff = np.diff(np.array(tmp), axis=1) + rect[1] = tmp[np.argmin(diff)] + rect[3] = tmp[np.argmax(diff)] + return rect + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + if type(box) is list: + box = np.array(box) + box = self.order_points_clockwise(box) + box = self.clip_det_res(box, img_height, img_width) + rect_width = int(np.linalg.norm(box[0] - box[1])) + rect_height = int(np.linalg.norm(box[0] - box[3])) + if rect_width <= 3 or rect_height <= 3: + continue + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + if type(box) is list: + box = np.array(box) + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def __call__(self, img): + ori_im = img.copy() + data = {'image': img} + + st = time.time() + + if self.args.benchmark: + self.autolog.times.start() + + data = transform(data, self.preprocess_op) + img, shape_list = data + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + img = img.copy() + + if self.args.benchmark: + self.autolog.times.stamp() + if self.use_onnx: + input_dict = 
{} + input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.args.benchmark: + self.autolog.times.stamp() + + preds = {} + if self.det_algorithm == "EAST": + preds['f_geo'] = outputs[0] + preds['f_score'] = outputs[1] + elif self.det_algorithm == 'SAST': + preds['f_border'] = outputs[0] + preds['f_score'] = outputs[1] + preds['f_tco'] = outputs[2] + preds['f_tvo'] = outputs[3] + elif self.det_algorithm in ['DB', 'PSE', 'DB++']: + preds['maps'] = outputs[0] + elif self.det_algorithm == 'FCE': + for i, output in enumerate(outputs): + preds['level_{}'.format(i)] = output + elif self.det_algorithm == "CT": + preds['maps'] = outputs[0] + preds['score'] = outputs[1] + else: + raise NotImplementedError + + post_result = self.postprocess_op(preds, shape_list) + dt_boxes = post_result[0]['points'] + + if self.args.det_box_type == 'poly': + dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape) + else: + dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape) + + if self.args.benchmark: + self.autolog.times.end(stamp=True) + et = time.time() + return dt_boxes, et - st + + +if __name__ == "__main__": + args = utility.parse_args() + image_file_list = get_image_file_list(args.image_dir) + text_detector = TextDetector(args) + total_time = 0 + draw_img_save_dir = args.draw_img_save_dir + os.makedirs(draw_img_save_dir, exist_ok=True) + + if args.warmup: + img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) + for i in range(2): + res = text_detector(img) + + save_results = [] + for idx, image_file in enumerate(image_file_list): + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: + img = cv2.imread(image_file) + if not flag_pdf: + if img is None: + logger.debug("error in loading image:{}".format(image_file)) + continue + imgs = [img] + else: + page_num = args.page_num + if page_num > len(img) or page_num == 0: + page_num = len(img) + imgs = img[:page_num] + for index, img in enumerate(imgs): + st = time.time() + dt_boxes, _ = text_detector(img) + elapse = time.time() - st + total_time += elapse + if len(imgs) > 1: + save_pred = os.path.basename(image_file) + '_' + str( + index) + "\t" + str( + json.dumps([x.tolist() for x in dt_boxes])) + "\n" + else: + save_pred = os.path.basename(image_file) + "\t" + str( + json.dumps([x.tolist() for x in dt_boxes])) + "\n" + save_results.append(save_pred) + logger.info(save_pred) + if len(imgs) > 1: + logger.info("{}_{} The predict time of {}: {}".format( + idx, index, image_file, elapse)) + else: + logger.info("{} The predict time of {}: {}".format( + idx, image_file, elapse)) + + src_im = utility.draw_text_det_res(dt_boxes, img) + + if flag_gif: + save_file = image_file[:-3] + "png" + elif flag_pdf: + save_file = image_file.replace('.pdf', + '_' + str(index) + '.png') + else: + save_file = image_file + img_path = os.path.join( + draw_img_save_dir, + "det_res_{}".format(os.path.basename(save_file))) + cv2.imwrite(img_path, src_im) + logger.info("The visualized image saved in {}".format(img_path)) + + with open(os.path.join(draw_img_save_dir, "det_results.txt"), 'w') as f: + f.writelines(save_results) + f.close() + if args.benchmark: + text_detector.autolog.report() diff --git a/tools/infer/predict_e2e.py b/tools/infer/predict_e2e.py new file mode 100644 
index 0000000000000000000000000000000000000000..de315d701c7172ded4d30e48e79abee367f42239 --- /dev/null +++ b/tools/infer/predict_e2e.py @@ -0,0 +1,169 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import time +import sys + +import tools.infer.utility as utility +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.data import create_operators, transform +from ppocr.postprocess import build_post_process + +logger = get_logger() + + +class TextE2E(object): + def __init__(self, args): + self.args = args + self.e2e_algorithm = args.e2e_algorithm + self.use_onnx = args.use_onnx + pre_process_list = [{ + 'E2EResizeForTest': {} + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + postprocess_params = {} + if self.e2e_algorithm == "PGNet": + pre_process_list[0] = { + 'E2EResizeForTest': { + 'max_side_len': args.e2e_limit_side_len, + 'valid_set': 'totaltext' + } + } + postprocess_params['name'] = 'PGPostProcess' + postprocess_params["score_thresh"] = args.e2e_pgnet_score_thresh + postprocess_params["character_dict_path"] = args.e2e_char_dict_path + postprocess_params["valid_set"] = args.e2e_pgnet_valid_set + postprocess_params["mode"] = args.e2e_pgnet_mode + else: + logger.info("unknown e2e_algorithm:{}".format(self.e2e_algorithm)) + sys.exit(0) + + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, _ = utility.create_predictor( + args, 'e2e', logger) # paddle.jit.load(args.det_model_dir) + # self.predictor.eval() + + def clip_det_res(self, points, img_height, img_width): + for pno in range(points.shape[0]): + points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1)) + points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1)) + return points + + def filter_tag_det_res_only_clip(self, dt_boxes, image_shape): + img_height, img_width = image_shape[0:2] + dt_boxes_new = [] + for box in dt_boxes: + box = self.clip_det_res(box, img_height, img_width) + dt_boxes_new.append(box) + dt_boxes = np.array(dt_boxes_new) + return dt_boxes + + def __call__(self, img): + + ori_im = img.copy() + data = {'image': img} + data = transform(data, self.preprocess_op) + img, shape_list = data + if img is None: + return None, 0 + img = np.expand_dims(img, axis=0) + shape_list = np.expand_dims(shape_list, axis=0) + img = img.copy() + starttime = time.time() + + if self.use_onnx: + input_dict = {} + 
input_dict[self.input_tensor.name] = img + outputs = self.predictor.run(self.output_tensors, input_dict) + preds = {} + preds['f_border'] = outputs[0] + preds['f_char'] = outputs[1] + preds['f_direction'] = outputs[2] + preds['f_score'] = outputs[3] + else: + self.input_tensor.copy_from_cpu(img) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + + preds = {} + if self.e2e_algorithm == 'PGNet': + preds['f_border'] = outputs[0] + preds['f_char'] = outputs[1] + preds['f_direction'] = outputs[2] + preds['f_score'] = outputs[3] + else: + raise NotImplementedError + post_result = self.postprocess_op(preds, shape_list) + points, strs = post_result['points'], post_result['texts'] + dt_boxes = self.filter_tag_det_res_only_clip(points, ori_im.shape) + elapse = time.time() - starttime + return dt_boxes, strs, elapse + + +if __name__ == "__main__": + args = utility.parse_args() + image_file_list = get_image_file_list(args.image_dir) + text_detector = TextE2E(args) + count = 0 + total_time = 0 + draw_img_save = "./inference_results" + if not os.path.exists(draw_img_save): + os.makedirs(draw_img_save) + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + points, strs, elapse = text_detector(img) + if count > 0: + total_time += elapse + count += 1 + logger.info("Predict time of {}: {}".format(image_file, elapse)) + src_im = utility.draw_e2e_res(points, strs, image_file) + img_name_pure = os.path.split(image_file)[-1] + img_path = os.path.join(draw_img_save, + "e2e_res_{}".format(img_name_pure)) + cv2.imwrite(img_path, src_im) + logger.info("The visualized image saved in {}".format(img_path)) + if count > 1: + logger.info("Avg Time: {}".format(total_time / (count - 1))) diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py new file mode 100644 index 0000000000000000000000000000000000000000..b3ef557c09fb74990b65c266afa5d5c77960b7ed --- /dev/null +++ b/tools/infer/predict_rec.py @@ -0,0 +1,667 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
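A rough sketch of calling the PGNet end-to-end pipeline above directly, assuming the repo root is on sys.path; the model directory and image path are placeholders:

import cv2
import tools.infer.utility as utility
from tools.infer.predict_e2e import TextE2E

args = utility.parse_args()
args.e2e_algorithm = "PGNet"
args.e2e_model_dir = "./inference/e2e"   # placeholder: exported PGNet inference model
text_e2e = TextE2E(args)
points, strs, elapse = text_e2e(cv2.imread("sample.jpg"))  # polygons, transcriptions, seconds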
+import os +import sys +from PIL import Image +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import math +import time +import traceback +import paddle + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read + +logger = get_logger() + + +class TextRecognizer(object): + def __init__(self, args): + self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] + self.rec_batch_num = args.rec_batch_num + self.rec_algorithm = args.rec_algorithm + postprocess_params = { + 'name': 'CTCLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + if self.rec_algorithm == "SRN": + postprocess_params = { + 'name': 'SRNLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "RARE": + postprocess_params = { + 'name': 'AttnLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'NRTR': + postprocess_params = { + 'name': 'NRTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "SAR": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "VisionLAN": + postprocess_params = { + 'name': 'VLLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'ViTSTR': + postprocess_params = { + 'name': 'ViTSTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'ABINet': + postprocess_params = { + 'name': 'ABINetLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "SPIN": + postprocess_params = { + 'name': 'SPINLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "RobustScanner": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char, + "rm_symbol": True + } + elif self.rec_algorithm == 'RFL': + postprocess_params = { + 'name': 'RFLLabelDecode', + "character_dict_path": None, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "PREN": + postprocess_params = {'name': 'PRENLabelDecode'} + elif self.rec_algorithm == "CAN": + self.inverse = args.rec_image_inverse + postprocess_params = { + 'name': 'CANLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + self.postprocess_op = build_post_process(postprocess_params) + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'rec', logger) + self.benchmark = args.benchmark + self.use_onnx = args.use_onnx + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="rec", + 
model_precision=args.precision, + batch_size=args.rec_batch_num, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + + def resize_norm_img(self, img, max_wh_ratio): + imgC, imgH, imgW = self.rec_image_shape + if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # return padding_im + image_pil = Image.fromarray(np.uint8(img)) + if self.rec_algorithm == 'ViTSTR': + img = image_pil.resize([imgW, imgH], Image.BICUBIC) + else: + img = image_pil.resize([imgW, imgH], Image.ANTIALIAS) + img = np.array(img) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + if self.rec_algorithm == 'ViTSTR': + norm_img = norm_img.astype(np.float32) / 255. + else: + norm_img = norm_img.astype(np.float32) / 128. - 1. + return norm_img + elif self.rec_algorithm == 'RFL': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + assert imgC == img.shape[2] + imgW = int((imgH * max_wh_ratio)) + if self.use_onnx: + w = self.input_tensor.shape[3:][0] + if w is not None and w > 0: + imgW = w + + h, w = img.shape[:2] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + if self.rec_algorithm == 'RARE': + if resized_w > self.rec_image_shape[2]: + resized_w = self.rec_image_shape[2] + imgW = self.rec_image_shape[2] + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def resize_norm_img_vl(self, img, image_shape): + + imgC, imgH, imgW = image_shape + img = img[:, :, ::-1] # bgr2rgb + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + return resized_image + + def resize_norm_img_srn(self, img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + def srn_other_inputs(self, image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = 
np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile( + gsrm_slf_attn_bias1, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile( + gsrm_slf_attn_bias2, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + def process_image_srn(self, img, image_shape, num_heads, max_text_length): + norm_img = self.resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + self.srn_other_inputs(image_shape, num_heads, max_text_length) + + gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) + gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) + encoder_word_pos = encoder_word_pos.astype(np.int64) + gsrm_word_pos = gsrm_word_pos.astype(np.int64) + + return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + def resize_norm_img_sar(self, img, image_shape, + width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. + width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + def resize_norm_img_spin(self, img): + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # return padding_im + img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC) + img = np.array(img, np.float32) + img = np.expand_dims(img, -1) + img = img.transpose((2, 0, 1)) + mean = [127.5] + std = [127.5] + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + mean = np.float32(mean.reshape(1, -1)) + stdinv = 1 / np.float32(std.reshape(1, -1)) + img -= mean + img *= stdinv + return img + + def resize_norm_img_svtr(self, img, image_shape): + + imgC, imgH, imgW = image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + def 
resize_norm_img_abinet(self, img, image_shape): + + imgC, imgH, imgW = image_shape + + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255. + + mean = np.array([0.485, 0.456, 0.406]) + std = np.array([0.229, 0.224, 0.225]) + resized_image = ( + resized_image - mean[None, None, ...]) / std[None, None, ...] + resized_image = resized_image.transpose((2, 0, 1)) + resized_image = resized_image.astype('float32') + + return resized_image + + def norm_img_can(self, img, image_shape): + + img = cv2.cvtColor( + img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image + + if self.inverse: + img = 255 - img + + if self.rec_image_shape[0] == 1: + h, w = img.shape + _, imgH, imgW = self.rec_image_shape + if h < imgH or w < imgW: + padding_h = max(imgH - h, 0) + padding_w = max(imgW - w, 0) + img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), + 'constant', + constant_values=(255)) + img = img_padded + + img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w + img = img.astype('float32') + + return img + + def __call__(self, img_list): + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the recognition process + indices = np.argsort(np.array(width_list)) + rec_res = [['', 0.0]] * img_num + batch_num = self.rec_batch_num + st = time.time() + if self.benchmark: + self.autolog.times.start() + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + if self.rec_algorithm == "SRN": + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + if self.rec_algorithm == "SAR": + valid_ratios = [] + imgC, imgH, imgW = self.rec_image_shape[:3] + max_wh_ratio = imgW / imgH + # max_wh_ratio = 0 + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + if self.rec_algorithm == "SAR": + norm_img, _, _, valid_ratio = self.resize_norm_img_sar( + img_list[indices[ino]], self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + valid_ratio = np.expand_dims(valid_ratio, axis=0) + valid_ratios.append(valid_ratio) + norm_img_batch.append(norm_img) + elif self.rec_algorithm == "SRN": + norm_img = self.process_image_srn( + img_list[indices[ino]], self.rec_image_shape, 8, 25) + encoder_word_pos_list.append(norm_img[1]) + gsrm_word_pos_list.append(norm_img[2]) + gsrm_slf_attn_bias1_list.append(norm_img[3]) + gsrm_slf_attn_bias2_list.append(norm_img[4]) + norm_img_batch.append(norm_img[0]) + elif self.rec_algorithm == "SVTR": + norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], + self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + elif self.rec_algorithm in ["VisionLAN", "PREN"]: + norm_img = self.resize_norm_img_vl(img_list[indices[ino]], + self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + elif self.rec_algorithm == 'SPIN': + norm_img = self.resize_norm_img_spin(img_list[indices[ino]]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + elif self.rec_algorithm == "ABINet": + norm_img = self.resize_norm_img_abinet( + img_list[indices[ino]], self.rec_image_shape) + norm_img = 
norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + elif self.rec_algorithm == "RobustScanner": + norm_img, _, _, valid_ratio = self.resize_norm_img_sar( + img_list[indices[ino]], + self.rec_image_shape, + width_downsample_ratio=0.25) + norm_img = norm_img[np.newaxis, :] + valid_ratio = np.expand_dims(valid_ratio, axis=0) + valid_ratios = [] + valid_ratios.append(valid_ratio) + norm_img_batch.append(norm_img) + word_positions_list = [] + word_positions = np.array(range(0, 40)).astype('int64') + word_positions = np.expand_dims(word_positions, axis=0) + word_positions_list.append(word_positions) + elif self.rec_algorithm == "CAN": + norm_img = self.norm_img_can(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_image_mask = np.ones(norm_img.shape, dtype='float32') + word_label = np.ones([1, 36], dtype='int64') + norm_img_mask_batch = [] + word_label_list = [] + norm_img_mask_batch.append(norm_image_mask) + word_label_list.append(word_label) + else: + norm_img = self.resize_norm_img(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + if self.benchmark: + self.autolog.times.stamp() + + if self.rec_algorithm == "SRN": + encoder_word_pos_list = np.concatenate(encoder_word_pos_list) + gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list) + + inputs = [ + norm_img_batch, + encoder_word_pos_list, + gsrm_word_pos_list, + gsrm_slf_attn_bias1_list, + gsrm_slf_attn_bias2_list, + ] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = {"predict": outputs[2]} + else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = {"predict": outputs[2]} + elif self.rec_algorithm == "SAR": + valid_ratios = np.concatenate(valid_ratios) + inputs = [ + norm_img_batch, + np.array( + [valid_ratios], dtype=np.float32), + ] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs[0] + elif self.rec_algorithm == "RobustScanner": + valid_ratios = np.concatenate(valid_ratios) + word_positions_list = np.concatenate(word_positions_list) + inputs = [norm_img_batch, valid_ratios, word_positions_list] + + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + 
else: + input_names = self.predictor.get_input_names() + for i in range(len(input_names)): + input_tensor = self.predictor.get_input_handle( + input_names[i]) + input_tensor.copy_from_cpu(inputs[i]) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs[0] + elif self.rec_algorithm == "CAN": + norm_img_mask_batch = np.concatenate(norm_img_mask_batch) + word_label_list = np.concatenate(word_label_list) + inputs = [norm_img_batch, norm_img_mask_batch, word_label_list] + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs + else: + input_names = self.predictor.get_input_names() + input_tensor = [] + for i in range(len(input_names)): + input_tensor_i = self.predictor.get_input_handle( + input_names[i]) + input_tensor_i.copy_from_cpu(inputs[i]) + input_tensor.append(input_tensor_i) + self.input_tensor = input_tensor + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + preds = outputs + else: + if self.use_onnx: + input_dict = {} + input_dict[self.input_tensor.name] = norm_img_batch + outputs = self.predictor.run(self.output_tensors, + input_dict) + preds = outputs[0] + else: + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if self.benchmark: + self.autolog.times.stamp() + if len(outputs) != 1: + preds = outputs + else: + preds = outputs[0] + rec_result = self.postprocess_op(preds) + for rno in range(len(rec_result)): + rec_res[indices[beg_img_no + rno]] = rec_result[rno] + if self.benchmark: + self.autolog.times.end(stamp=True) + return rec_res, time.time() - st + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_recognizer = TextRecognizer(args) + valid_image_file_list = [] + img_list = [] + + logger.info( + "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " + "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" + ) + # warmup 2 times + if args.warmup: + img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8) + for i in range(2): + res = text_recognizer([img] * int(args.rec_batch_num)) + + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = cv2.imread(image_file) + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + rec_res, _ = text_recognizer(img_list) + + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + for ino in range(len(img_list)): + logger.info("Predicts of {}:{}".format(valid_image_file_list[ino], + rec_res[ino])) + if args.benchmark: + text_recognizer.autolog.report() + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/tools/infer/predict_sr.py b/tools/infer/predict_sr.py new file mode 100644 index 0000000000000000000000000000000000000000..ca99f6819f4b207ecc0f0d1383fe1d26d07fbf50 --- /dev/null +++ b/tools/infer/predict_sr.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +from PIL import Image +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, __dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import numpy as np +import math +import time +import traceback +import paddle + +import tools.infer.utility as utility +from ppocr.postprocess import build_post_process +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, check_and_read + +logger = get_logger() + + +class TextSR(object): + def __init__(self, args): + self.sr_image_shape = [int(v) for v in args.sr_image_shape.split(",")] + self.sr_batch_num = args.sr_batch_num + + self.predictor, self.input_tensor, self.output_tensors, self.config = \ + utility.create_predictor(args, 'sr', logger) + self.benchmark = args.benchmark + if args.benchmark: + import auto_log + pid = os.getpid() + gpu_id = utility.get_infer_gpuid() + self.autolog = auto_log.AutoLogger( + model_name="sr", + model_precision=args.precision, + batch_size=args.sr_batch_num, + data_shape="dynamic", + save_path=None, #args.save_log_path, + inference_config=self.config, + pids=pid, + process_name=None, + gpu_ids=gpu_id if args.use_gpu else None, + time_keys=[ + 'preprocess_time', 'inference_time', 'postprocess_time' + ], + warmup=0, + logger=logger) + + def resize_norm_img(self, img): + imgC, imgH, imgW = self.sr_image_shape + img = img.resize((imgW // 2, imgH // 2), Image.BICUBIC) + img_numpy = np.array(img).astype("float32") + img_numpy = img_numpy.transpose((2, 0, 1)) / 255 + return img_numpy + + def __call__(self, img_list): + img_num = len(img_list) + batch_num = self.sr_batch_num + st = time.time() + st = time.time() + all_result = [] * img_num + if self.benchmark: + self.autolog.times.start() + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + imgC, imgH, imgW = self.sr_image_shape + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[ino]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + if self.benchmark: + self.autolog.times.stamp() + self.input_tensor.copy_from_cpu(norm_img_batch) + self.predictor.run() + outputs = [] + for output_tensor in self.output_tensors: + output = output_tensor.copy_to_cpu() + outputs.append(output) + if len(outputs) != 1: + preds = outputs + else: + preds = outputs[0] + all_result.append(outputs) + if self.benchmark: + self.autolog.times.end(stamp=True) + return all_result, time.time() - st + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + text_recognizer = TextSR(args) + valid_image_file_list = [] + img_list = [] + + # warmup 2 times + if args.warmup: + img = np.random.uniform(0, 255, [16, 64, 
3]).astype(np.uint8) + for i in range(2): + res = text_recognizer([img] * int(args.sr_batch_num)) + + for image_file in image_file_list: + img, flag, _ = check_and_read(image_file) + if not flag: + img = Image.open(image_file).convert("RGB") + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(img) + try: + preds, _ = text_recognizer(img_list) + for beg_no in range(len(preds)): + sr_img = preds[beg_no][1] + lr_img = preds[beg_no][0] + for i in (range(sr_img.shape[0])): + fm_sr = (sr_img[i] * 255).transpose(1, 2, 0).astype(np.uint8) + fm_lr = (lr_img[i] * 255).transpose(1, 2, 0).astype(np.uint8) + img_name_pure = os.path.split(valid_image_file_list[ + beg_no * args.sr_batch_num + i])[-1] + cv2.imwrite("infer_result/sr_{}".format(img_name_pure), + fm_sr[:, :, ::-1]) + logger.info("The visualized image saved in infer_result/sr_{}". + format(img_name_pure)) + + except Exception as E: + logger.info(traceback.format_exc()) + logger.info(E) + exit() + if args.benchmark: + text_recognizer.autolog.report() + + +if __name__ == "__main__": + main(utility.parse_args()) diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py new file mode 100644 index 0000000000000000000000000000000000000000..1f9e2e1dc690bf6208e1d72c0ff460b95e202702 --- /dev/null +++ b/tools/infer/predict_system.py @@ -0,0 +1,262 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
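A small sketch of running the text super-resolution helper above on a single crop; as in its main(), the input is a PIL image, and the model path and crop path are placeholders:

from PIL import Image
import tools.infer.utility as utility
from tools.infer.predict_sr import TextSR

args = utility.parse_args()
args.sr_model_dir = "./inference/sr"               # placeholder: exported SR inference model
text_sr = TextSR(args)
crop = Image.open("word_crop.png").convert("RGB")  # placeholder low-resolution text crop
all_result, elapse = text_sr([crop])               # each entry holds the raw output tensors of one batch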
+import os +import sys +import subprocess + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import copy +import numpy as np +import json +import time +import logging +from PIL import Image +import tools.infer.utility as utility +import tools.infer.predict_rec as predict_rec +import tools.infer.predict_det as predict_det +import tools.infer.predict_cls as predict_cls +from ppocr.utils.utility import get_image_file_list, check_and_read +from ppocr.utils.logging import get_logger +from tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop +logger = get_logger() + + +class TextSystem(object): + def __init__(self, args): + if not args.show_log: + logger.setLevel(logging.INFO) + + self.text_detector = predict_det.TextDetector(args) + self.text_recognizer = predict_rec.TextRecognizer(args) + self.use_angle_cls = args.use_angle_cls + self.drop_score = args.drop_score + if self.use_angle_cls: + self.text_classifier = predict_cls.TextClassifier(args) + + self.args = args + self.crop_image_res_index = 0 + + def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res): + os.makedirs(output_dir, exist_ok=True) + bbox_num = len(img_crop_list) + for bno in range(bbox_num): + cv2.imwrite( + os.path.join(output_dir, + f"mg_crop_{bno+self.crop_image_res_index}.jpg"), + img_crop_list[bno]) + logger.debug(f"{bno}, {rec_res[bno]}") + self.crop_image_res_index += bbox_num + + def __call__(self, img, cls=True): + time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0} + start = time.time() + ori_im = img.copy() + dt_boxes, elapse = self.text_detector(img) + time_dict['det'] = elapse + logger.debug("dt_boxes num : {}, elapse : {}".format( + len(dt_boxes), elapse)) + if dt_boxes is None: + return None, None + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + if self.args.det_box_type == "quad": + img_crop = get_rotate_crop_image(ori_im, tmp_box) + else: + img_crop = get_minarea_rect_crop(ori_im, tmp_box) + img_crop_list.append(img_crop) + if self.use_angle_cls and cls: + img_crop_list, angle_list, elapse = self.text_classifier( + img_crop_list) + time_dict['cls'] = elapse + logger.debug("cls num : {}, elapse : {}".format( + len(img_crop_list), elapse)) + + rec_res, elapse = self.text_recognizer(img_crop_list) + time_dict['rec'] = elapse + logger.debug("rec_res num : {}, elapse : {}".format( + len(rec_res), elapse)) + if self.args.save_crop_res: + self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, + rec_res) + filter_boxes, filter_rec_res = [], [] + for box, rec_result in zip(dt_boxes, rec_res): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + end = time.time() + time_dict['all'] = end - start + return filter_boxes, filter_rec_res, time_dict + + +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < 
_boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + image_file_list = image_file_list[args.process_id::args.total_process_num] + text_sys = TextSystem(args) + is_visualize = True + font_path = args.vis_font_path + drop_score = args.drop_score + draw_img_save_dir = args.draw_img_save_dir + os.makedirs(draw_img_save_dir, exist_ok=True) + save_results = [] + + logger.info( + "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', " + "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320" + ) + + # warm up 10 times + if args.warmup: + img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8) + for i in range(10): + res = text_sys(img) + + total_time = 0 + cpu_mem, gpu_mem, gpu_util = 0, 0, 0 + _st = time.time() + count = 0 + for idx, image_file in enumerate(image_file_list): + + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: + img = cv2.imread(image_file) + if not flag_pdf: + if img is None: + logger.debug("error in loading image:{}".format(image_file)) + continue + imgs = [img] + else: + page_num = args.page_num + if page_num > len(img) or page_num == 0: + page_num = len(img) + imgs = img[:page_num] + for index, img in enumerate(imgs): + starttime = time.time() + dt_boxes, rec_res, time_dict = text_sys(img) + elapse = time.time() - starttime + total_time += elapse + if len(imgs) > 1: + logger.debug( + str(idx) + '_' + str(index) + " Predict time of %s: %.3fs" + % (image_file, elapse)) + else: + logger.debug( + str(idx) + " Predict time of %s: %.3fs" % (image_file, + elapse)) + for text, score in rec_res: + logger.debug("{}, {:.3f}".format(text, score)) + + res = [{ + "transcription": rec_res[i][0], + "points": np.array(dt_boxes[i]).astype(np.int32).tolist(), + } for i in range(len(dt_boxes))] + if len(imgs) > 1: + save_pred = os.path.basename(image_file) + '_' + str( + index) + "\t" + json.dumps( + res, ensure_ascii=False) + "\n" + else: + save_pred = os.path.basename(image_file) + "\t" + json.dumps( + res, ensure_ascii=False) + "\n" + save_results.append(save_pred) + + if is_visualize: + image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) + boxes = dt_boxes + txts = [rec_res[i][0] for i in range(len(rec_res))] + scores = [rec_res[i][1] for i in range(len(rec_res))] + + draw_img = draw_ocr_box_txt( + image, + boxes, + txts, + scores, + drop_score=drop_score, + font_path=font_path) + if flag_gif: + save_file = image_file[:-3] + "png" + elif flag_pdf: + save_file = image_file.replace('.pdf', + '_' + str(index) + '.png') + else: + save_file = image_file + cv2.imwrite( + os.path.join(draw_img_save_dir, + os.path.basename(save_file)), + draw_img[:, :, ::-1]) + logger.debug("The visualized image saved in {}".format( + os.path.join(draw_img_save_dir, os.path.basename( + save_file)))) + + logger.info("The predict total time is {}".format(time.time() - _st)) + if args.benchmark: + text_sys.text_detector.autolog.report() + text_sys.text_recognizer.autolog.report() + + with open( + os.path.join(draw_img_save_dir, "system_results.txt"), + 'w', + encoding='utf-8') as f: + f.writelines(save_results) + + +if __name__ == "__main__": + args = utility.parse_args() + if args.use_mp: + p_list = [] + total_process_num = args.total_process_num + for process_id in range(total_process_num): + cmd = [sys.executable, "-u"] + sys.argv + [ + 
"--process_id={}".format(process_id), + "--use_mp={}".format(False) + ] + p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout) + p_list.append(p) + for p in p_list: + p.wait() + else: + main(args) diff --git a/tools/infer/utility.py b/tools/infer/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..959373cd79315bc16521beb70c66b6c3556e6963 --- /dev/null +++ b/tools/infer/utility.py @@ -0,0 +1,663 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import sys +import platform +import cv2 +import numpy as np +import paddle +from PIL import Image, ImageDraw, ImageFont +import math +from paddle import inference +import time +import random +from ppocr.utils.logging import get_logger + + +def str2bool(v): + return v.lower() in ("true", "t", "1") + + +def init_args(): + parser = argparse.ArgumentParser() + # params for prediction engine + parser.add_argument("--use_gpu", type=str2bool, default=True) + parser.add_argument("--use_xpu", type=str2bool, default=False) + parser.add_argument("--use_npu", type=str2bool, default=False) + parser.add_argument("--ir_optim", type=str2bool, default=True) + parser.add_argument("--use_tensorrt", type=str2bool, default=False) + parser.add_argument("--min_subgraph_size", type=int, default=15) + parser.add_argument("--precision", type=str, default="fp32") + parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--gpu_id", type=int, default=0) + + # params for text detector + parser.add_argument("--image_dir", type=str) + parser.add_argument("--page_num", type=int, default=0) + parser.add_argument("--det_algorithm", type=str, default='DB') + parser.add_argument("--det_model_dir", type=str) + parser.add_argument("--det_limit_side_len", type=float, default=960) + parser.add_argument("--det_limit_type", type=str, default='max') + parser.add_argument("--det_box_type", type=str, default='quad') + + # DB parmas + parser.add_argument("--det_db_thresh", type=float, default=0.3) + parser.add_argument("--det_db_box_thresh", type=float, default=0.6) + parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) + parser.add_argument("--max_batch_size", type=int, default=10) + parser.add_argument("--use_dilation", type=str2bool, default=False) + parser.add_argument("--det_db_score_mode", type=str, default="fast") + + # EAST parmas + parser.add_argument("--det_east_score_thresh", type=float, default=0.8) + parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) + parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + + # SAST parmas + parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) + parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) + + # PSE parmas + parser.add_argument("--det_pse_thresh", type=float, default=0) + parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) + parser.add_argument("--det_pse_min_area", type=float, 
default=16) + parser.add_argument("--det_pse_scale", type=int, default=1) + + # FCE parmas + parser.add_argument("--scales", type=list, default=[8, 16, 32]) + parser.add_argument("--alpha", type=float, default=1.0) + parser.add_argument("--beta", type=float, default=1.0) + parser.add_argument("--fourier_degree", type=int, default=5) + + # params for text recognizer + parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') + parser.add_argument("--rec_model_dir", type=str) + parser.add_argument("--rec_image_inverse", type=str2bool, default=True) + parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") + parser.add_argument("--rec_batch_num", type=int, default=6) + parser.add_argument("--max_text_length", type=int, default=25) + parser.add_argument( + "--rec_char_dict_path", + type=str, + default="./ppocr/utils/ppocr_keys_v1.txt") + parser.add_argument("--use_space_char", type=str2bool, default=True) + parser.add_argument( + "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") + parser.add_argument("--drop_score", type=float, default=0.5) + + # params for e2e + parser.add_argument("--e2e_algorithm", type=str, default='PGNet') + parser.add_argument("--e2e_model_dir", type=str) + parser.add_argument("--e2e_limit_side_len", type=float, default=768) + parser.add_argument("--e2e_limit_type", type=str, default='max') + + # PGNet parmas + parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) + parser.add_argument( + "--e2e_char_dict_path", type=str, default="./ppocr/utils/ic15_dict.txt") + parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') + parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') + + # params for text classifier + parser.add_argument("--use_angle_cls", type=str2bool, default=False) + parser.add_argument("--cls_model_dir", type=str) + parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") + parser.add_argument("--label_list", type=list, default=['0', '180']) + parser.add_argument("--cls_batch_num", type=int, default=6) + parser.add_argument("--cls_thresh", type=float, default=0.9) + + parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--cpu_threads", type=int, default=10) + parser.add_argument("--use_pdserving", type=str2bool, default=False) + parser.add_argument("--warmup", type=str2bool, default=False) + + # SR parmas + parser.add_argument("--sr_model_dir", type=str) + parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") + parser.add_argument("--sr_batch_num", type=int, default=1) + + # + parser.add_argument( + "--draw_img_save_dir", type=str, default="./inference_results") + parser.add_argument("--save_crop_res", type=str2bool, default=False) + parser.add_argument("--crop_res_save_dir", type=str, default="./output") + + # multi-process + parser.add_argument("--use_mp", type=str2bool, default=False) + parser.add_argument("--total_process_num", type=int, default=1) + parser.add_argument("--process_id", type=int, default=0) + + parser.add_argument("--benchmark", type=str2bool, default=False) + parser.add_argument("--save_log_path", type=str, default="./log_output/") + + parser.add_argument("--show_log", type=str2bool, default=True) + parser.add_argument("--use_onnx", type=str2bool, default=False) + return parser + + +def parse_args(): + parser = init_args() + return parser.parse_args() + + +def create_predictor(args, mode, logger): + if mode == "det": + model_dir = args.det_model_dir + elif mode == 'cls': + 
model_dir = args.cls_model_dir + elif mode == 'rec': + model_dir = args.rec_model_dir + elif mode == 'table': + model_dir = args.table_model_dir + elif mode == 'ser': + model_dir = args.ser_model_dir + elif mode == 're': + model_dir = args.re_model_dir + elif mode == "sr": + model_dir = args.sr_model_dir + elif mode == 'layout': + model_dir = args.layout_model_dir + else: + model_dir = args.e2e_model_dir + + if model_dir is None: + logger.info("not find {} model file path {}".format(mode, model_dir)) + sys.exit(0) + if args.use_onnx: + import onnxruntime as ort + model_file_path = model_dir + if not os.path.exists(model_file_path): + raise ValueError("not find model file path {}".format( + model_file_path)) + sess = ort.InferenceSession(model_file_path) + return sess, sess.get_inputs()[0], None, None + + else: + file_names = ['model', 'inference'] + for file_name in file_names: + model_file_path = '{}/{}.pdmodel'.format(model_dir, file_name) + params_file_path = '{}/{}.pdiparams'.format(model_dir, file_name) + if os.path.exists(model_file_path) and os.path.exists( + params_file_path): + break + if not os.path.exists(model_file_path): + raise ValueError( + "not find model.pdmodel or inference.pdmodel in {}".format( + model_dir)) + if not os.path.exists(params_file_path): + raise ValueError( + "not find model.pdiparams or inference.pdiparams in {}".format( + model_dir)) + + config = inference.Config(model_file_path, params_file_path) + + if hasattr(args, 'precision'): + if args.precision == "fp16" and args.use_tensorrt: + precision = inference.PrecisionType.Half + elif args.precision == "int8": + precision = inference.PrecisionType.Int8 + else: + precision = inference.PrecisionType.Float32 + else: + precision = inference.PrecisionType.Float32 + + if args.use_gpu: + gpu_id = get_infer_gpuid() + if gpu_id is None: + logger.warning( + "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." + ) + config.enable_use_gpu(args.gpu_mem, args.gpu_id) + if args.use_tensorrt: + config.enable_tensorrt_engine( + workspace_size=1 << 30, + precision_mode=precision, + max_batch_size=args.max_batch_size, + min_subgraph_size=args. 
+ min_subgraph_size, # skip the minmum trt subgraph + use_calib_mode=False) + + # collect shape + trt_shape_f = os.path.join(model_dir, + f"{mode}_trt_dynamic_shape.txt") + + if not os.path.exists(trt_shape_f): + config.collect_shape_range_info(trt_shape_f) + logger.info( + f"collect dynamic shape info into : {trt_shape_f}") + try: + config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, + True) + except Exception as E: + logger.info(E) + logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!") + + elif args.use_npu: + config.enable_custom_device("npu") + elif args.use_xpu: + config.enable_xpu(10 * 1024 * 1024) + else: + config.disable_gpu() + if args.enable_mkldnn: + # cache 10 different shapes for mkldnn to avoid memory leak + config.set_mkldnn_cache_capacity(10) + config.enable_mkldnn() + if args.precision == "fp16": + config.enable_mkldnn_bfloat16() + if hasattr(args, "cpu_threads"): + config.set_cpu_math_library_num_threads(args.cpu_threads) + else: + # default cpu threads as 10 + config.set_cpu_math_library_num_threads(10) + # enable memory optim + config.enable_memory_optim() + config.disable_glog_info() + config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") + config.delete_pass("matmul_transpose_reshape_fuse_pass") + if mode == 're': + config.delete_pass("simplify_with_basic_ops_pass") + if mode == 'table': + config.delete_pass("fc_fuse_pass") # not supported for table + config.switch_use_feed_fetch_ops(False) + config.switch_ir_optim(True) + + # create predictor + predictor = inference.create_predictor(config) + input_names = predictor.get_input_names() + if mode in ['ser', 're']: + input_tensor = [] + for name in input_names: + input_tensor.append(predictor.get_input_handle(name)) + else: + for name in input_names: + input_tensor = predictor.get_input_handle(name) + output_tensors = get_output_tensors(args, mode, predictor) + return predictor, input_tensor, output_tensors, config + + +def get_output_tensors(args, mode, predictor): + output_names = predictor.get_output_names() + output_tensors = [] + if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]: + output_name = 'softmax_0.tmp_0' + if output_name in output_names: + return [predictor.get_output_handle(output_name)] + else: + for output_name in output_names: + output_tensor = predictor.get_output_handle(output_name) + output_tensors.append(output_tensor) + else: + for output_name in output_names: + output_tensor = predictor.get_output_handle(output_name) + output_tensors.append(output_tensor) + return output_tensors + + +def get_infer_gpuid(): + sysstr = platform.system() + if sysstr == "Windows": + return 0 + + if not paddle.fluid.core.is_compiled_with_rocm(): + cmd = "env | grep CUDA_VISIBLE_DEVICES" + else: + cmd = "env | grep HIP_VISIBLE_DEVICES" + env_cuda = os.popen(cmd).readlines() + if len(env_cuda) == 0: + return 0 + else: + gpu_id = env_cuda[0].strip().split("=")[1] + return int(gpu_id[0]) + + +def draw_e2e_res(dt_boxes, strs, img_path): + src_im = cv2.imread(img_path) + for box, str in zip(dt_boxes, strs): + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + cv2.putText( + src_im, + str, + org=(int(box[0, 0, 0]), int(box[0, 0, 1])), + fontFace=cv2.FONT_HERSHEY_COMPLEX, + fontScale=0.7, + color=(0, 255, 0), + thickness=1) + return src_im + + +def draw_text_det_res(dt_boxes, img): + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape(-1, 2) + cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2) + 
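# each detected box is reshaped to an integer N x 2 contour and drawn as a closed
# polyline directly on the input image, which is then returned for saving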
return img + + +def resize_img(img, input_size=600): + """ + resize img and limit the longest side of the image to input_size + """ + img = np.array(img) + im_shape = img.shape + im_size_max = np.max(im_shape[0:2]) + im_scale = float(input_size) / float(im_size_max) + img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale) + return img + + +def draw_ocr(image, + boxes, + txts=None, + scores=None, + drop_score=0.5, + font_path="./doc/fonts/simfang.ttf"): + """ + Visualize the results of OCR detection and recognition + args: + image(Image|array): RGB image + boxes(list): boxes with shape(N, 4, 2) + txts(list): the texts + scores(list): txxs corresponding scores + drop_score(float): only scores greater than drop_threshold will be visualized + font_path: the path of font which is used to draw text + return(array): + the visualized img + """ + if scores is None: + scores = [1] * len(boxes) + box_num = len(boxes) + for i in range(box_num): + if scores is not None and (scores[i] < drop_score or + math.isnan(scores[i])): + continue + box = np.reshape(np.array(boxes[i]), [-1, 1, 2]).astype(np.int64) + image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) + if txts is not None: + img = np.array(resize_img(image, input_size=600)) + txt_img = text_visual( + txts, + scores, + img_h=img.shape[0], + img_w=600, + threshold=drop_score, + font_path=font_path) + img = np.concatenate([np.array(img), np.array(txt_img)], axis=1) + return img + return image + + +def draw_ocr_box_txt(image, + boxes, + txts=None, + scores=None, + drop_score=0.5, + font_path="./doc/fonts/simfang.ttf"): + h, w = image.height, image.width + img_left = image.copy() + img_right = np.ones((h, w, 3), dtype=np.uint8) * 255 + random.seed(0) + + draw_left = ImageDraw.Draw(img_left) + if txts is None or len(txts) != len(boxes): + txts = [None] * len(boxes) + for idx, (box, txt) in enumerate(zip(boxes, txts)): + if scores is not None and scores[idx] < drop_score: + continue + color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + draw_left.polygon(box, fill=color) + img_right_text = draw_box_txt_fine((w, h), box, txt, font_path) + pts = np.array(box, np.int32).reshape((-1, 1, 2)) + cv2.polylines(img_right_text, [pts], True, color, 1) + img_right = cv2.bitwise_and(img_right, img_right_text) + img_left = Image.blend(image, img_left, 0.5) + img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h)) + return np.array(img_show) + + +def draw_box_txt_fine(img_size, box, txt, font_path="./doc/fonts/simfang.ttf"): + box_height = int( + math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][1])**2)) + box_width = int( + math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][1])**2)) + + if box_height > 2 * box_width and box_height > 30: + img_text = Image.new('RGB', (box_height, box_width), (255, 255, 255)) + draw_text = ImageDraw.Draw(img_text) + if txt: + font = create_font(txt, (box_height, box_width), font_path) + draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) + img_text = img_text.transpose(Image.ROTATE_270) + else: + img_text = Image.new('RGB', (box_width, box_height), (255, 255, 255)) + draw_text = ImageDraw.Draw(img_text) + if txt: + font = create_font(txt, (box_width, box_height), font_path) + draw_text.text([0, 0], txt, fill=(0, 0, 0), font=font) + + pts1 = np.float32( + [[0, 0], [box_width, 0], [box_width, box_height], [0, box_height]]) + pts2 = np.array(box, dtype=np.float32) 
+ M = cv2.getPerspectiveTransform(pts1, pts2) + + img_text = np.array(img_text, dtype=np.uint8) + img_right_text = cv2.warpPerspective( + img_text, + M, + img_size, + flags=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, + borderValue=(255, 255, 255)) + return img_right_text + + +def create_font(txt, sz, font_path="./doc/fonts/simfang.ttf"): + font_size = int(sz[1] * 0.99) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + length = font.getsize(txt)[0] + if length > sz[0]: + font_size = int(font_size * sz[0] / length) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + return font + + +def str_count(s): + """ + Count the number of Chinese characters, + a single English character and a single number + equal to half the length of Chinese characters. + args: + s(string): the input of string + return(int): + the number of Chinese characters + """ + import string + count_zh = count_pu = 0 + s_len = len(s) + en_dg_count = 0 + for c in s: + if c in string.ascii_letters or c.isdigit() or c.isspace(): + en_dg_count += 1 + elif c.isalpha(): + count_zh += 1 + else: + count_pu += 1 + return s_len - math.ceil(en_dg_count / 2) + + +def text_visual(texts, + scores, + img_h=400, + img_w=600, + threshold=0., + font_path="./doc/simfang.ttf"): + """ + create new blank img and draw txt on it + args: + texts(list): the text will be draw + scores(list|None): corresponding score of each txt + img_h(int): the height of blank img + img_w(int): the width of blank img + font_path: the path of font which is used to draw text + return(array): + """ + if scores is not None: + assert len(texts) == len( + scores), "The number of txts and corresponding scores must match" + + def create_blank_img(): + blank_img = np.ones(shape=[img_h, img_w], dtype=np.int8) * 255 + blank_img[:, img_w - 1:] = 0 + blank_img = Image.fromarray(blank_img).convert("RGB") + draw_txt = ImageDraw.Draw(blank_img) + return blank_img, draw_txt + + blank_img, draw_txt = create_blank_img() + + font_size = 20 + txt_color = (0, 0, 0) + font = ImageFont.truetype(font_path, font_size, encoding="utf-8") + + gap = font_size + 5 + txt_img_list = [] + count, index = 1, 0 + for idx, txt in enumerate(texts): + index += 1 + if scores[idx] < threshold or math.isnan(scores[idx]): + index -= 1 + continue + first_line = True + while str_count(txt) >= img_w // font_size - 4: + tmp = txt + txt = tmp[:img_w // font_size - 4] + if first_line: + new_txt = str(index) + ': ' + txt + first_line = False + else: + new_txt = ' ' + txt + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + txt = tmp[img_w // font_size - 4:] + if count >= img_h // gap - 1: + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + if first_line: + new_txt = str(index) + ': ' + txt + ' ' + '%.3f' % (scores[idx]) + else: + new_txt = " " + txt + " " + '%.3f' % (scores[idx]) + draw_txt.text((0, gap * count), new_txt, txt_color, font=font) + # whether add new blank img or not + if count >= img_h // gap - 1 and idx + 1 < len(texts): + txt_img_list.append(np.array(blank_img)) + blank_img, draw_txt = create_blank_img() + count = 0 + count += 1 + txt_img_list.append(np.array(blank_img)) + if len(txt_img_list) == 1: + blank_img = np.array(txt_img_list[0]) + else: + blank_img = np.concatenate(txt_img_list, axis=1) + return np.array(blank_img) + + +def base64_to_cv2(b64str): + import base64 + data = base64.b64decode(b64str.encode('utf8')) + data = np.frombuffer(data, np.uint8) + data = 
cv2.imdecode(data, cv2.IMREAD_COLOR) + return data + + +def draw_boxes(image, boxes, scores=None, drop_score=0.5): + if scores is None: + scores = [1] * len(boxes) + for (box, score) in zip(boxes, scores): + if score < drop_score: + continue + box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64) + image = cv2.polylines(np.array(image), [box], True, (255, 0, 0), 2) + return image + + +def get_rotate_crop_image(img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + assert len(points) == 4, "shape of points must be 4*2" + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + +def get_minarea_rect_crop(img, points): + bounding_box = cv2.minAreaRect(np.array(points).astype(np.int32)) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_a, index_b, index_c, index_d = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_a = 0 + index_d = 1 + else: + index_a = 1 + index_d = 0 + if points[3][1] > points[2][1]: + index_b = 2 + index_c = 3 + else: + index_b = 3 + index_c = 2 + + box = [points[index_a], points[index_b], points[index_c], points[index_d]] + crop_img = get_rotate_crop_image(img, np.array(box)) + return crop_img + + +def check_gpu(use_gpu): + if use_gpu and not paddle.is_compiled_with_cuda(): + use_gpu = False + return use_gpu + + +if __name__ == '__main__': + pass diff --git a/tools/infer_cls.py b/tools/infer_cls.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd6b536fbe50fb1240d84ca3a5e87236940c0f5 --- /dev/null +++ b/tools/infer_cls.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
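A short sketch of how get_rotate_crop_image above rectifies one detected quadrilateral into an upright text crop; the image path and box coordinates are placeholders:

import cv2
import numpy as np
from tools.infer.utility import get_rotate_crop_image

img = cv2.imread("sample.jpg")                                 # placeholder image
box = np.array([[60, 30], [300, 35], [298, 80], [58, 75]],
               dtype=np.float32)                               # placeholder 4 x 2 quadrilateral
crop = get_rotate_crop_image(img, box)  # perspective-warped crop, rotated 90 degrees if height/width >= 1.5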
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program + + +def main(): + global_config = config['Global'] + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + model = build_model(config['Architecture']) + + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['image'] + elif op_name == "SSLRotateResize": + op[op_name]["mode"] = "test" + transforms.append(op) + global_config['infer_mode'] = True + ops = create_operators(transforms, global_config) + + model.eval() + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + with open(file, 'rb') as f: + img = f.read() + data = {'image': img} + batch = transform(data, ops) + + images = np.expand_dims(batch[0], axis=0) + images = paddle.to_tensor(images) + preds = model(images) + post_result = post_process_class(preds) + for rec_result in post_result: + logger.info('\t result: {}'.format(rec_result)) + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_det.py b/tools/infer_det.py new file mode 100644 index 0000000000000000000000000000000000000000..f253e8f2876a5942538f18e93dfdada4391875b2 --- /dev/null +++ b/tools/infer_det.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
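# Usage sketch for this detection script (placeholder paths), following the same -c/-o convention:
#
#   python3 tools/infer_det.py -c configs/det/det_mv3_db.yml \
#       -o Global.infer_img=doc/imgs/1.jpg Global.save_res_path=./output/det_results.txt
#
# Detected polygons are written to Global.save_res_path as "<image path>\t<JSON box list>" lines,
# and draw_det_res() saves visualizations under a det_results/ folder next to that file.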
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import json +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program + + +def draw_det_res(dt_boxes, config, img, img_name, save_path): + if len(dt_boxes) > 0: + import cv2 + src_im = img + for box in dt_boxes: + box = np.array(box).astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + if not os.path.exists(save_path): + os.makedirs(save_path) + save_path = os.path.join(save_path, os.path.basename(img_name)) + cv2.imwrite(save_path, src_im) + logger.info("The detected Image saved in {}".format(save_path)) + + +@paddle.no_grad() +def main(): + global_config = config['Global'] + + # build model + model = build_model(config['Architecture']) + + load_model(config, model) + # build post process + post_process_class = build_post_process(config['PostProcess']) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['image', 'shape'] + transforms.append(op) + + ops = create_operators(transforms, global_config) + + save_res_path = config['Global']['save_res_path'] + if not os.path.exists(os.path.dirname(save_res_path)): + os.makedirs(os.path.dirname(save_res_path)) + + model.eval() + with open(save_res_path, "wb") as fout: + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + with open(file, 'rb') as f: + img = f.read() + data = {'image': img} + batch = transform(data, ops) + + images = np.expand_dims(batch[0], axis=0) + shape_list = np.expand_dims(batch[1], axis=0) + images = paddle.to_tensor(images) + preds = model(images) + post_result = post_process_class(preds, shape_list) + + src_img = cv2.imread(file) + + dt_boxes_json = [] + # parser boxes if post_result is dict + if isinstance(post_result, dict): + det_box_json = {} + for k in post_result.keys(): + boxes = post_result[k][0]['points'] + dt_boxes_list = [] + for box in boxes: + tmp_json = {"transcription": ""} + tmp_json['points'] = np.array(box).tolist() + dt_boxes_list.append(tmp_json) + det_box_json[k] = dt_boxes_list + save_det_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/det_results_{}/".format(k) + draw_det_res(boxes, config, src_img, file, save_det_path) + else: + boxes = post_result[0]['points'] + dt_boxes_json = [] + # write result + for box in boxes: + tmp_json = {"transcription": ""} + tmp_json['points'] = np.array(box).tolist() + dt_boxes_json.append(tmp_json) + save_det_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/det_results/" + draw_det_res(boxes, config, src_img, file, save_det_path) + otstr = file + "\t" + json.dumps(dt_boxes_json) + "\n" + fout.write(otstr.encode()) + + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff 
--git a/tools/infer_e2e.py b/tools/infer_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..37fdcbaadc2984c9cf4fb105b7122db31b99be30 --- /dev/null +++ b/tools/infer_e2e.py @@ -0,0 +1,174 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import json +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program +from PIL import Image, ImageDraw, ImageFont +import math + + +def draw_e2e_res_for_chinese(image, + boxes, + txts, + config, + img_name, + font_path="./doc/simfang.ttf"): + h, w = image.height, image.width + img_left = image.copy() + img_right = Image.new('RGB', (w, h), (255, 255, 255)) + + import random + + random.seed(0) + draw_left = ImageDraw.Draw(img_left) + draw_right = ImageDraw.Draw(img_right) + for idx, (box, txt) in enumerate(zip(boxes, txts)): + box = np.array(box) + box = [tuple(x) for x in box] + color = (random.randint(0, 255), random.randint(0, 255), + random.randint(0, 255)) + draw_left.polygon(box, fill=color) + draw_right.polygon(box, outline=color) + font = ImageFont.truetype(font_path, 15, encoding="utf-8") + draw_right.text([box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) + img_left = Image.blend(image, img_left, 0.5) + img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) + img_show.paste(img_left, (0, 0, w, h)) + img_show.paste(img_right, (w, 0, w * 2, h)) + + save_e2e_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/e2e_results/" + if not os.path.exists(save_e2e_path): + os.makedirs(save_e2e_path) + save_path = os.path.join(save_e2e_path, os.path.basename(img_name)) + cv2.imwrite(save_path, np.array(img_show)[:, :, ::-1]) + logger.info("The e2e Image saved in {}".format(save_path)) + + +def draw_e2e_res(dt_boxes, strs, config, img, img_name): + if len(dt_boxes) > 0: + src_im = img + for box, str in zip(dt_boxes, strs): + box = box.astype(np.int32).reshape((-1, 1, 2)) + cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) + cv2.putText( + src_im, + str, + org=(int(box[0, 0, 0]), int(box[0, 0, 1])), + fontFace=cv2.FONT_HERSHEY_COMPLEX, + fontScale=0.7, + color=(0, 255, 0), + thickness=1) + save_det_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/e2e_results/" + if not os.path.exists(save_det_path): + os.makedirs(save_det_path) + save_path = os.path.join(save_det_path, os.path.basename(img_name)) + 
cv2.imwrite(save_path, src_im) + logger.info("The e2e Image saved in {}".format(save_path)) + + +def main(): + global_config = config['Global'] + + # build model + model = build_model(config['Architecture']) + + load_model(config, model) + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['image', 'shape'] + transforms.append(op) + + ops = create_operators(transforms, global_config) + + save_res_path = config['Global']['save_res_path'] + if not os.path.exists(os.path.dirname(save_res_path)): + os.makedirs(os.path.dirname(save_res_path)) + + model.eval() + with open(save_res_path, "wb") as fout: + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + with open(file, 'rb') as f: + img = f.read() + data = {'image': img} + batch = transform(data, ops) + images = np.expand_dims(batch[0], axis=0) + shape_list = np.expand_dims(batch[1], axis=0) + images = paddle.to_tensor(images) + preds = model(images) + post_result = post_process_class(preds, shape_list) + points, strs = post_result['points'], post_result['texts'] + # write result + dt_boxes_json = [] + for poly, str in zip(points, strs): + tmp_json = {"transcription": str} + tmp_json['points'] = poly.tolist() + dt_boxes_json.append(tmp_json) + otstr = file + "\t" + json.dumps(dt_boxes_json) + "\n" + fout.write(otstr.encode()) + src_img = cv2.imread(file) + if global_config['infer_visual_type'] == 'EN': + draw_e2e_res(points, strs, config, src_img, file) + elif global_config['infer_visual_type'] == 'CN': + src_img = Image.fromarray( + cv2.cvtColor(src_img, cv2.COLOR_BGR2RGB)) + draw_e2e_res_for_chinese( + src_img, + points, + strs, + config, + file, + font_path="./doc/fonts/simfang.ttf") + + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_kie.py b/tools/infer_kie.py new file mode 100644 index 0000000000000000000000000000000000000000..9375434cc887b08dfa746420a6c73c58c6e04797 --- /dev/null +++ b/tools/infer_kie.py @@ -0,0 +1,176 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
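# Usage sketch for this KIE (SDMGR-style) script (placeholder paths). Note that Global.infer_img
# points at an annotation list, not a raw image: each line is "<relative image path>\t<label json>",
# with paths resolved against Eval.dataset.data_dir (see main() below). For example:
#
#   python3 tools/infer_kie.py -c configs/kie/kie_unet_sdmgr.yml \
#       -o Global.infer_img=./train_data/wildreceipt/test.txt \
#          Global.class_path=./train_data/wildreceipt/class_list.txt
#
# Visual results go to a kie_results/ folder beside Global.save_res_path via draw_kie_result().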
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle.nn.functional as F + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import cv2 +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.utils.save_load import load_model +import tools.program as program +import time + + +def read_class_list(filepath): + ret = {} + with open(filepath, "r") as f: + lines = f.readlines() + for idx, line in enumerate(lines): + ret[idx] = line.strip("\n") + return ret + + +def draw_kie_result(batch, node, idx_to_cls, count): + img = batch[6].copy() + boxes = batch[7] + h, w = img.shape[:2] + pred_img = np.ones((h, w * 2, 3), dtype=np.uint8) * 255 + max_value, max_idx = paddle.max(node, -1), paddle.argmax(node, -1) + node_pred_label = max_idx.numpy().tolist() + node_pred_score = max_value.numpy().tolist() + + for i, box in enumerate(boxes): + if i >= len(node_pred_label): + break + new_box = [[box[0], box[1]], [box[2], box[1]], [box[2], box[3]], + [box[0], box[3]]] + Pts = np.array([new_box], np.int32) + cv2.polylines( + img, [Pts.reshape((-1, 1, 2))], + True, + color=(255, 255, 0), + thickness=1) + x_min = int(min([point[0] for point in new_box])) + y_min = int(min([point[1] for point in new_box])) + + pred_label = node_pred_label[i] + if pred_label in idx_to_cls: + pred_label = idx_to_cls[pred_label] + pred_score = '{:.2f}'.format(node_pred_score[i]) + text = pred_label + '(' + pred_score + ')' + cv2.putText(pred_img, text, (x_min * 2, y_min), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1) + vis_img = np.ones((h, w * 3, 3), dtype=np.uint8) * 255 + vis_img[:, :w] = img + vis_img[:, w:] = pred_img + save_kie_path = os.path.dirname(config['Global'][ + 'save_res_path']) + "/kie_results/" + if not os.path.exists(save_kie_path): + os.makedirs(save_kie_path) + save_path = os.path.join(save_kie_path, str(count) + ".png") + cv2.imwrite(save_path, vis_img) + logger.info("The Kie Image saved in {}".format(save_path)) + +def write_kie_result(fout, node, data): + """ + Write infer result to output file, sorted by the predict label of each line. + The format keeps the same as the input with additional score attribute. 
+ """ + import json + label = data['label'] + annotations = json.loads(label) + max_value, max_idx = paddle.max(node, -1), paddle.argmax(node, -1) + node_pred_label = max_idx.numpy().tolist() + node_pred_score = max_value.numpy().tolist() + res = [] + for i, label in enumerate(node_pred_label): + pred_score = '{:.2f}'.format(node_pred_score[i]) + pred_res = { + 'label': label, + 'transcription': annotations[i]['transcription'], + 'score': pred_score, + 'points': annotations[i]['points'], + } + res.append(pred_res) + res.sort(key=lambda x: x['label']) + fout.writelines([json.dumps(res, ensure_ascii=False) + '\n']) + +def main(): + global_config = config['Global'] + + # build model + model = build_model(config['Architecture']) + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + transforms.append(op) + + data_dir = config['Eval']['dataset']['data_dir'] + + ops = create_operators(transforms, global_config) + + save_res_path = config['Global']['save_res_path'] + class_path = config['Global']['class_path'] + idx_to_cls = read_class_list(class_path) + os.makedirs(os.path.dirname(save_res_path), exist_ok=True) + + model.eval() + + warmup_times = 0 + count_t = [] + with open(save_res_path, "w") as fout: + with open(config['Global']['infer_img'], "rb") as f: + lines = f.readlines() + for index, data_line in enumerate(lines): + if index == 10: + warmup_t = time.time() + data_line = data_line.decode('utf-8') + substr = data_line.strip("\n").split("\t") + img_path, label = data_dir + "/" + substr[0], substr[1] + data = {'img_path': img_path, 'label': label} + with open(data['img_path'], 'rb') as f: + img = f.read() + data['image'] = img + st = time.time() + batch = transform(data, ops) + batch_pred = [0] * len(batch) + for i in range(len(batch)): + batch_pred[i] = paddle.to_tensor( + np.expand_dims( + batch[i], axis=0)) + st = time.time() + node, edge = model(batch_pred) + node = F.softmax(node, -1) + count_t.append(time.time() - st) + draw_kie_result(batch, node, idx_to_cls, index) + write_kie_result(fout, node, data) + fout.close() + logger.info("success!") + logger.info("It took {} s for predict {} images.".format( + np.sum(count_t), len(count_t))) + ips = len(count_t[warmup_times:]) / np.sum(count_t[warmup_times:]) + logger.info("The ips is {} images/s".format(ips)) + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_kie_token_ser.py b/tools/infer_kie_token_ser.py new file mode 100644 index 0000000000000000000000000000000000000000..2fc5749b9c10b9c89bc16e561fbe9c5ce58eb13c --- /dev/null +++ b/tools/infer_kie_token_ser.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +import cv2 +import json +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.visual import draw_ser_results +from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps +import tools.program as program + + +def to_tensor(data): + import numbers + from collections import defaultdict + data_dict = defaultdict(list) + to_tensor_idxs = [] + + for idx, v in enumerate(data): + if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)): + if idx not in to_tensor_idxs: + to_tensor_idxs.append(idx) + data_dict[idx].append(v) + for idx in to_tensor_idxs: + data_dict[idx] = paddle.to_tensor(data_dict[idx]) + return list(data_dict.values()) + + +class SerPredictor(object): + def __init__(self, config): + global_config = config['Global'] + self.algorithm = config['Architecture']["algorithm"] + + # build post process + self.post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + self.model = build_model(config['Architecture']) + + load_model( + config, self.model, model_type=config['Architecture']["model_type"]) + + from paddleocr import PaddleOCR + + self.ocr_engine = PaddleOCR( + use_angle_cls=False, + show_log=False, + rec_model_dir=global_config.get("kie_rec_model_dir", None), + det_model_dir=global_config.get("kie_det_model_dir", None), + use_gpu=global_config['use_gpu']) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + op[op_name]['ocr_engine'] = self.ocr_engine + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = [ + 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', + 'image', 'labels', 'segment_offset_id', 'ocr_info', + 'entities' + ] + + transforms.append(op) + if config["Global"].get("infer_mode", None) is None: + global_config['infer_mode'] = True + self.ops = create_operators(config['Eval']['dataset']['transforms'], + global_config) + self.model.eval() + + def __call__(self, data): + with open(data["img_path"], 'rb') as f: + img = f.read() + data["image"] = img + batch = transform(data, self.ops) + batch = to_tensor(batch) + preds = self.model(batch) + + post_result = self.post_process_class( + preds, segment_offset_ids=batch[6], ocr_infos=batch[7]) + return post_result, batch + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + os.makedirs(config['Global']['save_res_path'], exist_ok=True) + + ser_engine = SerPredictor(config) + + if config["Global"].get("infer_mode", None) is False: + data_dir = config['Eval']['dataset']['data_dir'] + with open(config['Global']['infer_img'], "rb") as f: + infer_imgs = f.readlines() + else: + infer_imgs = get_image_file_list(config['Global']['infer_img']) + + with open( + os.path.join(config['Global']['save_res_path'], + "infer_results.txt"), + "w", + encoding='utf-8') as fout: + for idx, info in enumerate(infer_imgs): + if config["Global"].get("infer_mode", None) is False: + data_line = info.decode('utf-8') 
+ substr = data_line.strip("\n").split("\t") + img_path = os.path.join(data_dir, substr[0]) + data = {'img_path': img_path, 'label': substr[1]} + else: + img_path = info + data = {'img_path': img_path} + + save_img_path = os.path.join( + config['Global']['save_res_path'], + os.path.splitext(os.path.basename(img_path))[0] + "_ser.jpg") + + result, _ = ser_engine(data) + result = result[0] + fout.write(img_path + "\t" + json.dumps( + { + "ocr_info": result, + }, ensure_ascii=False) + "\n") + img_res = draw_ser_results(img_path, result) + cv2.imwrite(save_img_path, img_res) + + logger.info("process: [{}/{}], save result to {}".format( + idx, len(infer_imgs), save_img_path)) diff --git a/tools/infer_kie_token_ser_re.py b/tools/infer_kie_token_ser_re.py new file mode 100644 index 0000000000000000000000000000000000000000..76120a913f36c815cbd3b9314523ea91e3290065 --- /dev/null +++ b/tools/infer_kie_token_ser_re.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' +import cv2 +import json +import paddle +import paddle.distributed as dist + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.visual import draw_re_results +from ppocr.utils.logging import get_logger +from ppocr.utils.utility import get_image_file_list, load_vqa_bio_label_maps, print_dict +from tools.program import ArgsParser, load_config, merge_config +from tools.infer_kie_token_ser import SerPredictor + + +class ReArgsParser(ArgsParser): + def __init__(self): + super(ReArgsParser, self).__init__() + self.add_argument( + "-c_ser", "--config_ser", help="ser configuration file to use") + self.add_argument( + "-o_ser", + "--opt_ser", + nargs='+', + help="set ser configuration options ") + + def parse_args(self, argv=None): + args = super(ReArgsParser, self).parse_args(argv) + assert args.config_ser is not None, \ + "Please specify --config_ser=ser_configure_file_path." 
+ args.opt_ser = self._parse_opt(args.opt_ser) + return args + + +def make_input(ser_inputs, ser_results): + entities_labels = {'HEADER': 0, 'QUESTION': 1, 'ANSWER': 2} + batch_size, max_seq_len = ser_inputs[0].shape[:2] + entities = ser_inputs[8][0] + ser_results = ser_results[0] + assert len(entities) == len(ser_results) + + # entities + start = [] + end = [] + label = [] + entity_idx_dict = {} + for i, (res, entity) in enumerate(zip(ser_results, entities)): + if res['pred'] == 'O': + continue + entity_idx_dict[len(start)] = i + start.append(entity['start']) + end.append(entity['end']) + label.append(entities_labels[res['pred']]) + + entities = np.full([max_seq_len + 1, 3], fill_value=-1, dtype=np.int64) + entities[0, 0] = len(start) + entities[1:len(start) + 1, 0] = start + entities[0, 1] = len(end) + entities[1:len(end) + 1, 1] = end + entities[0, 2] = len(label) + entities[1:len(label) + 1, 2] = label + + # relations + head = [] + tail = [] + for i in range(len(label)): + for j in range(len(label)): + if label[i] == 1 and label[j] == 2: + head.append(i) + tail.append(j) + + relations = np.full([len(head) + 1, 2], fill_value=-1, dtype=np.int64) + relations[0, 0] = len(head) + relations[1:len(head) + 1, 0] = head + relations[0, 1] = len(tail) + relations[1:len(tail) + 1, 1] = tail + + entities = np.expand_dims(entities, axis=0) + entities = np.repeat(entities, batch_size, axis=0) + relations = np.expand_dims(relations, axis=0) + relations = np.repeat(relations, batch_size, axis=0) + + # remove ocr_info segment_offset_id and label in ser input + if isinstance(ser_inputs[0], paddle.Tensor): + entities = paddle.to_tensor(entities) + relations = paddle.to_tensor(relations) + ser_inputs = ser_inputs[:5] + [entities, relations] + + entity_idx_dict_batch = [] + for b in range(batch_size): + entity_idx_dict_batch.append(entity_idx_dict) + return ser_inputs, entity_idx_dict_batch + + +class SerRePredictor(object): + def __init__(self, config, ser_config): + global_config = config['Global'] + if "infer_mode" in global_config: + ser_config["Global"]["infer_mode"] = global_config["infer_mode"] + + self.ser_engine = SerPredictor(ser_config) + + # init re model + + # build post process + self.post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + self.model = build_model(config['Architecture']) + + load_model( + config, self.model, model_type=config['Architecture']["model_type"]) + + self.model.eval() + + def __call__(self, data): + ser_results, ser_inputs = self.ser_engine(data) + re_input, entity_idx_dict_batch = make_input(ser_inputs, ser_results) + if self.model.backbone.use_visual_backbone is False: + re_input.pop(4) + preds = self.model(re_input) + post_result = self.post_process_class( + preds, + ser_results=ser_results, + entity_idx_dict_batch=entity_idx_dict_batch) + return post_result + + +def preprocess(): + FLAGS = ReArgsParser().parse_args() + config = load_config(FLAGS.config) + config = merge_config(config, FLAGS.opt) + + ser_config = load_config(FLAGS.config_ser) + ser_config = merge_config(ser_config, FLAGS.opt_ser) + + logger = get_logger() + + # check if set use_gpu=True in paddlepaddle cpu version + use_gpu = config['Global']['use_gpu'] + + device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu' + device = paddle.set_device(device) + + logger.info('{} re config {}'.format('*' * 10, '*' * 10)) + print_dict(config, logger) + logger.info('\n') + logger.info('{} ser config {}'.format('*' * 10, '*' * 10)) + 
print_dict(ser_config, logger) + logger.info('train with paddle {} and device {}'.format(paddle.__version__, + device)) + return config, ser_config, device, logger + + +if __name__ == '__main__': + config, ser_config, device, logger = preprocess() + os.makedirs(config['Global']['save_res_path'], exist_ok=True) + + ser_re_engine = SerRePredictor(config, ser_config) + + if config["Global"].get("infer_mode", None) is False: + data_dir = config['Eval']['dataset']['data_dir'] + with open(config['Global']['infer_img'], "rb") as f: + infer_imgs = f.readlines() + else: + infer_imgs = get_image_file_list(config['Global']['infer_img']) + + with open( + os.path.join(config['Global']['save_res_path'], + "infer_results.txt"), + "w", + encoding='utf-8') as fout: + for idx, info in enumerate(infer_imgs): + if config["Global"].get("infer_mode", None) is False: + data_line = info.decode('utf-8') + substr = data_line.strip("\n").split("\t") + img_path = os.path.join(data_dir, substr[0]) + data = {'img_path': img_path, 'label': substr[1]} + else: + img_path = info + data = {'img_path': img_path} + + save_img_path = os.path.join( + config['Global']['save_res_path'], + os.path.splitext(os.path.basename(img_path))[0] + "_ser_re.jpg") + + result = ser_re_engine(data) + result = result[0] + fout.write(img_path + "\t" + json.dumps( + result, ensure_ascii=False) + "\n") + img_res = draw_re_results(img_path, result) + cv2.imwrite(save_img_path, img_res) + + logger.info("process: [{}/{}], save result to {}".format( + idx, len(infer_imgs), save_img_path)) diff --git a/tools/infer_rec.py b/tools/infer_rec.py new file mode 100644 index 0000000000000000000000000000000000000000..29aab9b57853b16bf615c893c30351a403270b57 --- /dev/null +++ b/tools/infer_rec.py @@ -0,0 +1,188 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
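# Usage sketch for this recognition script (placeholder paths), run on cropped word images:
#
#   python3 tools/infer_rec.py -c configs/rec/PP-OCRv3/ch_PP-OCRv3_rec.yml \
#       -o Global.infer_img=doc/imgs_words/ch/word_1.jpg
#
# Predictions are logged and written to Global.save_res_path (default ./output/rec/predicts_rec.txt)
# as "<image path>\t<text>\t<score>" lines (a JSON dict for distillation-style heads).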
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys +import json + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program + + +def main(): + global_config = config['Global'] + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + if config['Architecture']["algorithm"] in ["Distillation", + ]: # distillation model + for key in config['Architecture']["Models"]: + if config['Architecture']['Models'][key]['Head'][ + 'name'] == 'MultiHead': # for multi head + out_channels_list = {} + if config['PostProcess'][ + 'name'] == 'DistillationSARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Models'][key]['Head'][ + 'out_channels_list'] = out_channels_list + else: + config['Architecture']["Models"][key]["Head"][ + 'out_channels'] = char_num + elif config['Architecture']['Head'][ + 'name'] == 'MultiHead': # for multi head loss + out_channels_list = {} + if config['PostProcess']['name'] == 'SARLabelDecode': + char_num = char_num - 2 + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Head'][ + 'out_channels_list'] = out_channels_list + else: # base rec model + config['Architecture']["Head"]['out_channels'] = char_num + + model = build_model(config['Architecture']) + + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name in ['RecResizeImg']: + op[op_name]['infer_mode'] = True + elif op_name == 'KeepKeys': + if config['Architecture']['algorithm'] == "SRN": + op[op_name]['keep_keys'] = [ + 'image', 'encoder_word_pos', 'gsrm_word_pos', + 'gsrm_slf_attn_bias1', 'gsrm_slf_attn_bias2' + ] + elif config['Architecture']['algorithm'] == "SAR": + op[op_name]['keep_keys'] = ['image', 'valid_ratio'] + elif config['Architecture']['algorithm'] == "RobustScanner": + op[op_name][ + 'keep_keys'] = ['image', 'valid_ratio', 'word_positons'] + else: + op[op_name]['keep_keys'] = ['image'] + transforms.append(op) + global_config['infer_mode'] = True + ops = create_operators(transforms, global_config) + + save_res_path = config['Global'].get('save_res_path', + "./output/rec/predicts_rec.txt") + if not os.path.exists(os.path.dirname(save_res_path)): + os.makedirs(os.path.dirname(save_res_path)) + + model.eval() + + with open(save_res_path, "w") as fout: + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + with open(file, 'rb') as f: + img = f.read() + data = {'image': img} + batch = transform(data, ops) + if config['Architecture']['algorithm'] == "SRN": + encoder_word_pos_list = np.expand_dims(batch[1], axis=0) 
+ gsrm_word_pos_list = np.expand_dims(batch[2], axis=0) + gsrm_slf_attn_bias1_list = np.expand_dims(batch[3], axis=0) + gsrm_slf_attn_bias2_list = np.expand_dims(batch[4], axis=0) + + others = [ + paddle.to_tensor(encoder_word_pos_list), + paddle.to_tensor(gsrm_word_pos_list), + paddle.to_tensor(gsrm_slf_attn_bias1_list), + paddle.to_tensor(gsrm_slf_attn_bias2_list) + ] + if config['Architecture']['algorithm'] == "SAR": + valid_ratio = np.expand_dims(batch[-1], axis=0) + img_metas = [paddle.to_tensor(valid_ratio)] + if config['Architecture']['algorithm'] == "RobustScanner": + valid_ratio = np.expand_dims(batch[1], axis=0) + word_positons = np.expand_dims(batch[2], axis=0) + img_metas = [ + paddle.to_tensor(valid_ratio), + paddle.to_tensor(word_positons), + ] + if config['Architecture']['algorithm'] == "CAN": + image_mask = paddle.ones( + (np.expand_dims( + batch[0], axis=0).shape), dtype='float32') + label = paddle.ones((1, 36), dtype='int64') + images = np.expand_dims(batch[0], axis=0) + images = paddle.to_tensor(images) + if config['Architecture']['algorithm'] == "SRN": + preds = model(images, others) + elif config['Architecture']['algorithm'] == "SAR": + preds = model(images, img_metas) + elif config['Architecture']['algorithm'] == "RobustScanner": + preds = model(images, img_metas) + elif config['Architecture']['algorithm'] == "CAN": + preds = model([images, image_mask, label]) + else: + preds = model(images) + post_result = post_process_class(preds) + info = None + if isinstance(post_result, dict): + rec_info = dict() + for key in post_result: + if len(post_result[key][0]) >= 2: + rec_info[key] = { + "label": post_result[key][0][0], + "score": float(post_result[key][0][1]), + } + info = json.dumps(rec_info, ensure_ascii=False) + elif isinstance(post_result, list) and isinstance(post_result[0], + int): + # for RFLearning CNT branch + info = str(post_result[0]) + else: + if len(post_result[0]) >= 2: + info = post_result[0][0] + "\t" + str(post_result[0][1]) + + if info is not None: + logger.info("\t result: {}".format(info)) + fout.write(file + "\t" + info + "\n") + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_sr.py b/tools/infer_sr.py new file mode 100644 index 0000000000000000000000000000000000000000..df4334f3427e57b9062dd819aa16c110fd771e8c --- /dev/null +++ b/tools/infer_sr.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
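# Usage sketch for this text super-resolution script (placeholder paths):
#
#   python3 tools/infer_sr.py -c configs/sr/sr_tsrn_transformer_strock.yml \
#       -o Global.infer_img=doc/imgs_words_en/word_52.png
#
# The model's "sr_img" output is saved as sr_<image name> under Global.save_visual
# (default "infer_result/").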
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys +import json +from PIL import Image +import cv2 + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, __dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import paddle + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +import tools.program as program + + +def main(): + global_config = config['Global'] + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # sr transform + config['Architecture']["Transform"]['infer_mode'] = True + + model = build_model(config['Architecture']) + + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Label' in op_name: + continue + elif op_name in ['SRResize']: + op[op_name]['infer_mode'] = True + elif op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['img_lr'] + transforms.append(op) + global_config['infer_mode'] = True + ops = create_operators(transforms, global_config) + + save_visual_path = config['Global'].get('save_visual', "infer_result/") + if not os.path.exists(os.path.dirname(save_visual_path)): + os.makedirs(os.path.dirname(save_visual_path)) + + model.eval() + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + img = Image.open(file).convert("RGB") + data = {'image_lr': img} + batch = transform(data, ops) + images = np.expand_dims(batch[0], axis=0) + images = paddle.to_tensor(images) + + preds = model(images) + sr_img = preds["sr_img"][0] + lr_img = preds["lr_img"][0] + fm_sr = (sr_img.numpy() * 255).transpose(1, 2, 0).astype(np.uint8) + fm_lr = (lr_img.numpy() * 255).transpose(1, 2, 0).astype(np.uint8) + img_name_pure = os.path.split(file)[-1] + cv2.imwrite("{}/sr_{}".format(save_visual_path, img_name_pure), + fm_sr[:, :, ::-1]) + logger.info("The visualized image saved in infer_result/sr_{}".format( + img_name_pure)) + + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main() diff --git a/tools/infer_table.py b/tools/infer_table.py new file mode 100644 index 0000000000000000000000000000000000000000..6dde5d67d061f4d0593928759db34bb9b22cde0d --- /dev/null +++ b/tools/infer_table.py @@ -0,0 +1,121 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
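# Usage sketch for this table structure recognition script (placeholder paths):
#
#   python3 tools/infer_table.py -c configs/table/SLANet.yml \
#       -o Global.infer_img=ppstructure/docs/table/table.jpg
#
# Predicted HTML structure tokens and cell boxes are logged and appended to
# <Global.save_res_path>/infer.txt, with box visualizations saved alongside in the same folder.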
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import os +import sys +import json + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +os.environ["FLAGS_allocator_strategy"] = 'auto_growth' + +import paddle +from paddle.jit import to_static + +from ppocr.data import create_operators, transform +from ppocr.modeling.architectures import build_model +from ppocr.postprocess import build_post_process +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import get_image_file_list +from ppocr.utils.visual import draw_rectangle +from tools.infer.utility import draw_boxes +import tools.program as program +import cv2 + + +@paddle.no_grad() +def main(config, device, logger, vdl_writer): + global_config = config['Global'] + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + if hasattr(post_process_class, 'character'): + config['Architecture']["Head"]['out_channels'] = len( + getattr(post_process_class, 'character')) + + model = build_model(config['Architecture']) + algorithm = config['Architecture']['algorithm'] + + load_model(config, model) + + # create data ops + transforms = [] + for op in config['Eval']['dataset']['transforms']: + op_name = list(op)[0] + if 'Encode' in op_name: + continue + if op_name == 'KeepKeys': + op[op_name]['keep_keys'] = ['image', 'shape'] + transforms.append(op) + + global_config['infer_mode'] = True + ops = create_operators(transforms, global_config) + + save_res_path = config['Global']['save_res_path'] + os.makedirs(save_res_path, exist_ok=True) + + model.eval() + with open( + os.path.join(save_res_path, 'infer.txt'), mode='w', + encoding='utf-8') as f_w: + for file in get_image_file_list(config['Global']['infer_img']): + logger.info("infer_img: {}".format(file)) + with open(file, 'rb') as f: + img = f.read() + data = {'image': img} + batch = transform(data, ops) + images = np.expand_dims(batch[0], axis=0) + shape_list = np.expand_dims(batch[1], axis=0) + + images = paddle.to_tensor(images) + preds = model(images) + post_result = post_process_class(preds, [shape_list]) + + structure_str_list = post_result['structure_batch_list'][0] + bbox_list = post_result['bbox_batch_list'][0] + structure_str_list = structure_str_list[0] + structure_str_list = [ + '', '', '' + ] + structure_str_list + ['
', '', ''] + bbox_list_str = json.dumps(bbox_list.tolist()) + + logger.info("result: {}, {}".format(structure_str_list, + bbox_list_str)) + f_w.write("result: {}, {}\n".format(structure_str_list, + bbox_list_str)) + + if len(bbox_list) > 0 and len(bbox_list[0]) == 4: + img = draw_rectangle(file, bbox_list) + else: + img = draw_boxes(cv2.imread(file), bbox_list) + cv2.imwrite( + os.path.join(save_res_path, os.path.basename(file)), img) + logger.info('save result to {}'.format(save_res_path)) + logger.info("success!") + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess() + main(config, device, logger, vdl_writer) diff --git a/tools/program.py b/tools/program.py new file mode 100644 index 0000000000000000000000000000000000000000..209212fde1e1e803b2144d000611688d14ffc6c0 --- /dev/null +++ b/tools/program.py @@ -0,0 +1,702 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import platform +import yaml +import time +import datetime +import paddle +import paddle.distributed as dist +from tqdm import tqdm +import cv2 +import numpy as np +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +from ppocr.utils.stats import TrainingStats +from ppocr.utils.save_load import save_model +from ppocr.utils.utility import print_dict, AverageMeter +from ppocr.utils.logging import get_logger +from ppocr.utils.loggers import VDLLogger, WandbLogger, Loggers +from ppocr.utils import profiler +from ppocr.data import build_dataloader + + +class ArgsParser(ArgumentParser): + def __init__(self): + super(ArgsParser, self).__init__( + formatter_class=RawDescriptionHelpFormatter) + self.add_argument("-c", "--config", help="configuration file to use") + self.add_argument( + "-o", "--opt", nargs='+', help="set configuration options") + self.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format ' \ + '\"key1=value1;key2=value2;key3=value3\".' + ) + + def parse_args(self, argv=None): + args = super(ArgsParser, self).parse_args(argv) + assert args.config is not None, \ + "Please specify --config=configure_file_path." + args.opt = self._parse_opt(args.opt) + return args + + def _parse_opt(self, opts): + config = {} + if not opts: + return config + for s in opts: + s = s.strip() + k, v = s.split('=') + config[k] = yaml.load(v, Loader=yaml.Loader) + return config + + +def load_config(file_path): + """ + Load config from yml/yaml file. + Args: + file_path (str): Path of the config file to be loaded. + Returns: global config + """ + _, ext = os.path.splitext(file_path) + assert ext in ['.yml', '.yaml'], "only support yaml files for now" + config = yaml.load(open(file_path, 'rb'), Loader=yaml.Loader) + return config + + +def merge_config(config, opts): + """ + Merge config into global config. 
+ Args: + config (dict): Config to be merged. + Returns: global config + """ + for key, value in opts.items(): + if "." not in key: + if isinstance(value, dict) and key in config: + config[key].update(value) + else: + config[key] = value + else: + sub_keys = key.split('.') + assert ( + sub_keys[0] in config + ), "the sub_keys can only be one of global_config: {}, but get: " \ + "{}, please check your running command".format( + config.keys(), sub_keys[0]) + cur = config[sub_keys[0]] + for idx, sub_key in enumerate(sub_keys[1:]): + if idx == len(sub_keys) - 2: + cur[sub_key] = value + else: + cur = cur[sub_key] + return config + + +def check_device(use_gpu, use_xpu=False, use_npu=False, use_mlu=False): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + err = "Config {} cannot be set as true while your paddle " \ + "is not compiled with {} ! \nPlease try: \n" \ + "\t1. Install paddlepaddle to run model on {} \n" \ + "\t2. Set {} as false in config file to run " \ + "model on CPU" + + try: + if use_gpu and use_xpu: + print("use_xpu and use_gpu can not both be ture.") + if use_gpu and not paddle.is_compiled_with_cuda(): + print(err.format("use_gpu", "cuda", "gpu", "use_gpu")) + sys.exit(1) + if use_xpu and not paddle.device.is_compiled_with_xpu(): + print(err.format("use_xpu", "xpu", "xpu", "use_xpu")) + sys.exit(1) + if use_npu: + if int(paddle.version.major) != 0 and int( + paddle.version.major) <= 2 and int( + paddle.version.minor) <= 4: + if not paddle.device.is_compiled_with_npu(): + print(err.format("use_npu", "npu", "npu", "use_npu")) + sys.exit(1) + # is_compiled_with_npu() has been updated after paddle-2.4 + else: + if not paddle.device.is_compiled_with_custom_device("npu"): + print(err.format("use_npu", "npu", "npu", "use_npu")) + sys.exit(1) + if use_mlu and not paddle.device.is_compiled_with_mlu(): + print(err.format("use_mlu", "mlu", "mlu", "use_mlu")) + sys.exit(1) + except Exception as e: + pass + + +def to_float32(preds): + if isinstance(preds, dict): + for k in preds: + if isinstance(preds[k], dict) or isinstance(preds[k], list): + preds[k] = to_float32(preds[k]) + elif isinstance(preds[k], paddle.Tensor): + preds[k] = preds[k].astype(paddle.float32) + elif isinstance(preds, list): + for k in range(len(preds)): + if isinstance(preds[k], dict): + preds[k] = to_float32(preds[k]) + elif isinstance(preds[k], list): + preds[k] = to_float32(preds[k]) + elif isinstance(preds[k], paddle.Tensor): + preds[k] = preds[k].astype(paddle.float32) + elif isinstance(preds, paddle.Tensor): + preds = preds.astype(paddle.float32) + return preds + + +def train(config, + train_dataloader, + valid_dataloader, + device, + model, + loss_class, + optimizer, + lr_scheduler, + post_process_class, + eval_class, + pre_best_model_dict, + logger, + log_writer=None, + scaler=None, + amp_level='O2', + amp_custom_black_list=[]): + cal_metric_during_train = config['Global'].get('cal_metric_during_train', + False) + calc_epoch_interval = config['Global'].get('calc_epoch_interval', 1) + log_smooth_window = config['Global']['log_smooth_window'] + epoch_num = config['Global']['epoch_num'] + print_batch_step = config['Global']['print_batch_step'] + eval_batch_step = config['Global']['eval_batch_step'] + profiler_options = config['profiler_options'] + + global_step = 0 + if 'global_step' in pre_best_model_dict: + global_step = pre_best_model_dict['global_step'] + start_eval_step = 0 + if type(eval_batch_step) == list and len(eval_batch_step) >= 2: + start_eval_step = 
eval_batch_step[0] + eval_batch_step = eval_batch_step[1] + if len(valid_dataloader) == 0: + logger.info( + 'No Images in eval dataset, evaluation during training ' \ + 'will be disabled' + ) + start_eval_step = 1e111 + logger.info( + "During the training process, after the {}th iteration, " \ + "an evaluation is run every {} iterations". + format(start_eval_step, eval_batch_step)) + save_epoch_step = config['Global']['save_epoch_step'] + save_model_dir = config['Global']['save_model_dir'] + if not os.path.exists(save_model_dir): + os.makedirs(save_model_dir) + main_indicator = eval_class.main_indicator + best_model_dict = {main_indicator: 0} + best_model_dict.update(pre_best_model_dict) + train_stats = TrainingStats(log_smooth_window, ['lr']) + model_average = False + model.train() + + use_srn = config['Architecture']['algorithm'] == "SRN" + extra_input_models = [ + "SRN", "NRTR", "SAR", "SEED", "SVTR", "SPIN", "VisionLAN", + "RobustScanner", "RFL", 'DRRG' + ] + extra_input = False + if config['Architecture']['algorithm'] == 'Distillation': + for key in config['Architecture']["Models"]: + extra_input = extra_input or config['Architecture']['Models'][key][ + 'algorithm'] in extra_input_models + else: + extra_input = config['Architecture']['algorithm'] in extra_input_models + try: + model_type = config['Architecture']['model_type'] + except: + model_type = None + + algorithm = config['Architecture']['algorithm'] + + start_epoch = best_model_dict[ + 'start_epoch'] if 'start_epoch' in best_model_dict else 1 + + total_samples = 0 + train_reader_cost = 0.0 + train_batch_cost = 0.0 + reader_start = time.time() + eta_meter = AverageMeter() + + max_iter = len(train_dataloader) - 1 if platform.system( + ) == "Windows" else len(train_dataloader) + + for epoch in range(start_epoch, epoch_num + 1): + if train_dataloader.dataset.need_reset: + train_dataloader = build_dataloader( + config, 'Train', device, logger, seed=epoch) + max_iter = len(train_dataloader) - 1 if platform.system( + ) == "Windows" else len(train_dataloader) + + for idx, batch in enumerate(train_dataloader): + profiler.add_profiler_step(profiler_options) + train_reader_cost += time.time() - reader_start + if idx >= max_iter: + break + lr = optimizer.get_lr() + images = batch[0] + if use_srn: + model_average = True + # use amp + if scaler: + with paddle.amp.auto_cast( + level=amp_level, + custom_black_list=amp_custom_black_list): + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie"]: + preds = model(batch) + elif algorithm in ['CAN']: + preds = model(batch[:3]) + else: + preds = model(images) + preds = to_float32(preds) + loss = loss_class(preds, batch) + avg_loss = loss['loss'] + scaled_avg_loss = scaler.scale(avg_loss) + scaled_avg_loss.backward() + scaler.minimize(optimizer, scaled_avg_loss) + else: + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie", 'sr']: + preds = model(batch) + elif algorithm in ['CAN']: + preds = model(batch[:3]) + else: + preds = model(images) + loss = loss_class(preds, batch) + avg_loss = loss['loss'] + avg_loss.backward() + optimizer.step() + + optimizer.clear_grad() + + if cal_metric_during_train and epoch % calc_epoch_interval == 0: # only rec and cls need + batch = [item.numpy() for item in batch] + if model_type in ['kie', 'sr']: + eval_class(preds, batch) + elif model_type in ['table']: + post_result = post_process_class(preds, batch) + eval_class(post_result, batch) + elif algorithm in 
['CAN']: + model_type = 'can' + eval_class(preds[0], batch[2:], epoch_reset=(idx == 0)) + else: + if config['Loss']['name'] in ['MultiLoss', 'MultiLoss_v2' + ]: # for multi head loss + post_result = post_process_class( + preds['ctc'], batch[1]) # for CTC head out + elif config['Loss']['name'] in ['VLLoss']: + post_result = post_process_class(preds, batch[1], + batch[-1]) + else: + post_result = post_process_class(preds, batch[1]) + eval_class(post_result, batch) + metric = eval_class.get_metric() + train_stats.update(metric) + + train_batch_time = time.time() - reader_start + train_batch_cost += train_batch_time + eta_meter.update(train_batch_time) + global_step += 1 + total_samples += len(images) + + if not isinstance(lr_scheduler, float): + lr_scheduler.step() + + # logger and visualdl + stats = {k: v.numpy().mean() for k, v in loss.items()} + stats['lr'] = lr + train_stats.update(stats) + + if log_writer is not None and dist.get_rank() == 0: + log_writer.log_metrics( + metrics=train_stats.get(), prefix="TRAIN", step=global_step) + + if dist.get_rank() == 0 and ( + (global_step > 0 and global_step % print_batch_step == 0) or + (idx >= len(train_dataloader) - 1)): + logs = train_stats.log() + + eta_sec = ((epoch_num + 1 - epoch) * \ + len(train_dataloader) - idx - 1) * eta_meter.avg + eta_sec_format = str(datetime.timedelta(seconds=int(eta_sec))) + strs = 'epoch: [{}/{}], global_step: {}, {}, avg_reader_cost: ' \ + '{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, ' \ + 'ips: {:.5f} samples/s, eta: {}'.format( + epoch, epoch_num, global_step, logs, + train_reader_cost / print_batch_step, + train_batch_cost / print_batch_step, + total_samples / print_batch_step, + total_samples / train_batch_cost, eta_sec_format) + logger.info(strs) + + total_samples = 0 + train_reader_cost = 0.0 + train_batch_cost = 0.0 + # eval + if global_step > start_eval_step and \ + (global_step - start_eval_step) % eval_batch_step == 0 \ + and dist.get_rank() == 0: + if model_average: + Model_Average = paddle.incubate.optimizer.ModelAverage( + 0.15, + parameters=model.parameters(), + min_average_window=10000, + max_average_window=15625) + Model_Average.apply() + cur_metric = eval( + model, + valid_dataloader, + post_process_class, + eval_class, + model_type, + extra_input=extra_input, + scaler=scaler, + amp_level=amp_level, + amp_custom_black_list=amp_custom_black_list) + cur_metric_str = 'cur metric, {}'.format(', '.join( + ['{}: {}'.format(k, v) for k, v in cur_metric.items()])) + logger.info(cur_metric_str) + + # logger metric + if log_writer is not None: + log_writer.log_metrics( + metrics=cur_metric, prefix="EVAL", step=global_step) + + if cur_metric[main_indicator] >= best_model_dict[ + main_indicator]: + best_model_dict.update(cur_metric) + best_model_dict['best_epoch'] = epoch + save_model( + model, + optimizer, + save_model_dir, + logger, + config, + is_best=True, + prefix='best_accuracy', + best_model_dict=best_model_dict, + epoch=epoch, + global_step=global_step) + best_str = 'best metric, {}'.format(', '.join([ + '{}: {}'.format(k, v) for k, v in best_model_dict.items() + ])) + logger.info(best_str) + # logger best metric + if log_writer is not None: + log_writer.log_metrics( + metrics={ + "best_{}".format(main_indicator): + best_model_dict[main_indicator] + }, + prefix="EVAL", + step=global_step) + + log_writer.log_model( + is_best=True, + prefix="best_accuracy", + metadata=best_model_dict) + + reader_start = time.time() + if dist.get_rank() == 0: + save_model( + model, + optimizer, + 
save_model_dir, + logger, + config, + is_best=False, + prefix='latest', + best_model_dict=best_model_dict, + epoch=epoch, + global_step=global_step) + + if log_writer is not None: + log_writer.log_model(is_best=False, prefix="latest") + + if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0: + save_model( + model, + optimizer, + save_model_dir, + logger, + config, + is_best=False, + prefix='iter_epoch_{}'.format(epoch), + best_model_dict=best_model_dict, + epoch=epoch, + global_step=global_step) + if log_writer is not None: + log_writer.log_model( + is_best=False, prefix='iter_epoch_{}'.format(epoch)) + + best_str = 'best metric, {}'.format(', '.join( + ['{}: {}'.format(k, v) for k, v in best_model_dict.items()])) + logger.info(best_str) + if dist.get_rank() == 0 and log_writer is not None: + log_writer.close() + return + + +def eval(model, + valid_dataloader, + post_process_class, + eval_class, + model_type=None, + extra_input=False, + scaler=None, + amp_level='O2', + amp_custom_black_list=[]): + model.eval() + with paddle.no_grad(): + total_frame = 0.0 + total_time = 0.0 + pbar = tqdm( + total=len(valid_dataloader), + desc='eval model:', + position=0, + leave=True) + max_iter = len(valid_dataloader) - 1 if platform.system( + ) == "Windows" else len(valid_dataloader) + sum_images = 0 + for idx, batch in enumerate(valid_dataloader): + if idx >= max_iter: + break + images = batch[0] + start = time.time() + + # use amp + if scaler: + with paddle.amp.auto_cast( + level=amp_level, + custom_black_list=amp_custom_black_list): + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie"]: + preds = model(batch) + elif model_type in ['can']: + preds = model(batch[:3]) + elif model_type in ['sr']: + preds = model(batch) + sr_img = preds["sr_img"] + lr_img = preds["lr_img"] + else: + preds = model(images) + preds = to_float32(preds) + else: + if model_type == 'table' or extra_input: + preds = model(images, data=batch[1:]) + elif model_type in ["kie"]: + preds = model(batch) + elif model_type in ['can']: + preds = model(batch[:3]) + elif model_type in ['sr']: + preds = model(batch) + sr_img = preds["sr_img"] + lr_img = preds["lr_img"] + else: + preds = model(images) + + batch_numpy = [] + for item in batch: + if isinstance(item, paddle.Tensor): + batch_numpy.append(item.numpy()) + else: + batch_numpy.append(item) + # Obtain usable results from post-processing methods + total_time += time.time() - start + # Evaluate the results of the current batch + if model_type in ['table', 'kie']: + if post_process_class is None: + eval_class(preds, batch_numpy) + else: + post_result = post_process_class(preds, batch_numpy) + eval_class(post_result, batch_numpy) + elif model_type in ['sr']: + eval_class(preds, batch_numpy) + elif model_type in ['can']: + eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0)) + else: + post_result = post_process_class(preds, batch_numpy[1]) + eval_class(post_result, batch_numpy) + + pbar.update(1) + total_frame += len(images) + sum_images += 1 + # Get final metric,eg. 
acc or hmean + metric = eval_class.get_metric() + + pbar.close() + model.train() + metric['fps'] = total_frame / total_time + return metric + + +def update_center(char_center, post_result, preds): + result, label = post_result + feats, logits = preds + logits = paddle.argmax(logits, axis=-1) + feats = feats.numpy() + logits = logits.numpy() + + for idx_sample in range(len(label)): + if result[idx_sample][0] == label[idx_sample][0]: + feat = feats[idx_sample] + logit = logits[idx_sample] + for idx_time in range(len(logit)): + index = logit[idx_time] + if index in char_center.keys(): + char_center[index][0] = ( + char_center[index][0] * char_center[index][1] + + feat[idx_time]) / (char_center[index][1] + 1) + char_center[index][1] += 1 + else: + char_center[index] = [feat[idx_time], 1] + return char_center + + +def get_center(model, eval_dataloader, post_process_class): + pbar = tqdm(total=len(eval_dataloader), desc='get center:') + max_iter = len(eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(eval_dataloader) + char_center = dict() + for idx, batch in enumerate(eval_dataloader): + if idx >= max_iter: + break + images = batch[0] + start = time.time() + preds = model(images) + + batch = [item.numpy() for item in batch] + # Obtain usable results from post-processing methods + post_result = post_process_class(preds, batch[1]) + + #update char_center + char_center = update_center(char_center, post_result, preds) + pbar.update(1) + + pbar.close() + for key in char_center.keys(): + char_center[key] = char_center[key][0] + return char_center + + +def preprocess(is_train=False): + FLAGS = ArgsParser().parse_args() + profiler_options = FLAGS.profiler_options + config = load_config(FLAGS.config) + config = merge_config(config, FLAGS.opt) + profile_dic = {"profiler_options": FLAGS.profiler_options} + config = merge_config(config, profile_dic) + + if is_train: + # save_config + save_model_dir = config['Global']['save_model_dir'] + os.makedirs(save_model_dir, exist_ok=True) + with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f: + yaml.dump( + dict(config), f, default_flow_style=False, sort_keys=False) + log_file = '{}/train.log'.format(save_model_dir) + else: + log_file = None + logger = get_logger(log_file=log_file) + + # check if set use_gpu=True in paddlepaddle cpu version + use_gpu = config['Global'].get('use_gpu', False) + use_xpu = config['Global'].get('use_xpu', False) + use_npu = config['Global'].get('use_npu', False) + use_mlu = config['Global'].get('use_mlu', False) + + alg = config['Architecture']['algorithm'] + assert alg in [ + 'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN', + 'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE', + 'SEED', 'SDMGR', 'LayoutXLM', 'LayoutLM', 'LayoutLMv2', 'PREN', 'FCE', + 'SVTR', 'ViTSTR', 'ABINet', 'DB++', 'TableMaster', 'SPIN', 'VisionLAN', + 'Gestalt', 'SLANet', 'RobustScanner', 'CT', 'RFL', 'DRRG', 'CAN', + 'Telescope' + ] + + if use_xpu: + device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0)) + elif use_npu: + device = 'npu:{0}'.format(os.getenv('FLAGS_selected_npus', 0)) + elif use_mlu: + device = 'mlu:{0}'.format(os.getenv('FLAGS_selected_mlus', 0)) + else: + device = 'gpu:{}'.format(dist.ParallelEnv() + .dev_id) if use_gpu else 'cpu' + check_device(use_gpu, use_xpu, use_npu, use_mlu) + + device = paddle.set_device(device) + + config['Global']['distributed'] = dist.get_world_size() != 1 + + loggers = [] + + if 'use_visualdl' in config['Global'] and config['Global']['use_visualdl']: 
+ save_model_dir = config['Global']['save_model_dir'] + vdl_writer_path = '{}/vdl/'.format(save_model_dir) + log_writer = VDLLogger(vdl_writer_path) + loggers.append(log_writer) + if ('use_wandb' in config['Global'] and + config['Global']['use_wandb']) or 'wandb' in config: + save_dir = config['Global']['save_model_dir'] + wandb_writer_path = "{}/wandb".format(save_dir) + if "wandb" in config: + wandb_params = config['wandb'] + else: + wandb_params = dict() + wandb_params.update({'save_dir': save_dir}) + log_writer = WandbLogger(**wandb_params, config=config) + loggers.append(log_writer) + else: + log_writer = None + print_dict(config, logger) + + if loggers: + log_writer = Loggers(loggers) + else: + log_writer = None + + logger.info('train with paddle {} and device {}'.format(paddle.__version__, + device)) + return config, device, logger, log_writer diff --git a/tools/test_hubserving.py b/tools/test_hubserving.py new file mode 100644 index 0000000000000000000000000000000000000000..ec17a9413e15b3ae92843990cfcbb05fc5f991a8 --- /dev/null +++ b/tools/test_hubserving.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) + +from ppocr.utils.logging import get_logger +logger = get_logger() + +import cv2 +import numpy as np +import time +from PIL import Image +from ppocr.utils.utility import get_image_file_list +from tools.infer.utility import draw_ocr, draw_boxes, str2bool +from ppstructure.utility import draw_structure_result +from ppstructure.predict_system import to_excel + +import requests +import json +import base64 + + +def cv2_to_base64(image): + return base64.b64encode(image).decode('utf8') + + +def draw_server_result(image_file, res): + img = cv2.imread(image_file) + image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) + if len(res) == 0: + return np.array(image) + keys = res[0].keys() + if 'text_region' not in keys: # for ocr_rec, draw function is invalid + logger.info("draw function is invalid for ocr_rec!") + return None + elif 'text' not in keys: # for ocr_det + logger.info("draw text boxes only!") + boxes = [] + for dno in range(len(res)): + boxes.append(res[dno]['text_region']) + boxes = np.array(boxes) + draw_img = draw_boxes(image, boxes) + return draw_img + else: # for ocr_system + logger.info("draw boxes and texts!") + boxes = [] + texts = [] + scores = [] + for dno in range(len(res)): + boxes.append(res[dno]['text_region']) + texts.append(res[dno]['text']) + scores.append(res[dno]['confidence']) + boxes = np.array(boxes) + scores = np.array(scores) + draw_img = draw_ocr( + image, boxes, texts, scores, draw_txt=True, drop_score=0.5) + return draw_img + + +def save_structure_res(res, save_folder, image_file): + img = cv2.imread(image_file) + excel_save_folder = os.path.join(save_folder, os.path.basename(image_file)) + 
os.makedirs(excel_save_folder, exist_ok=True) + # save res + with open( + os.path.join(excel_save_folder, 'res.txt'), 'w', + encoding='utf8') as f: + for region in res: + if region['type'] == 'Table': + excel_path = os.path.join(excel_save_folder, + '{}.xlsx'.format(region['bbox'])) + to_excel(region['res'], excel_path) + elif region['type'] == 'Figure': + x1, y1, x2, y2 = region['bbox'] + print(region['bbox']) + roi_img = img[y1:y2, x1:x2, :] + img_path = os.path.join(excel_save_folder, + '{}.jpg'.format(region['bbox'])) + cv2.imwrite(img_path, roi_img) + else: + for text_result in region['res']: + f.write('{}\n'.format(json.dumps(text_result))) + + +def main(args): + image_file_list = get_image_file_list(args.image_dir) + is_visualize = False + headers = {"Content-type": "application/json"} + cnt = 0 + total_time = 0 + for image_file in image_file_list: + img = open(image_file, 'rb').read() + if img is None: + logger.info("error in loading image:{}".format(image_file)) + continue + img_name = os.path.basename(image_file) + # seed http request + starttime = time.time() + data = {'images': [cv2_to_base64(img)]} + r = requests.post( + url=args.server_url, headers=headers, data=json.dumps(data)) + elapse = time.time() - starttime + total_time += elapse + logger.info("Predict time of %s: %.3fs" % (image_file, elapse)) + res = r.json()["results"][0] + logger.info(res) + + if args.visualize: + draw_img = None + if 'structure_table' in args.server_url: + to_excel(res['html'], './{}.xlsx'.format(img_name)) + elif 'structure_system' in args.server_url: + save_structure_res(res['regions'], args.output, image_file) + else: + draw_img = draw_server_result(image_file, res) + if draw_img is not None: + if not os.path.exists(args.output): + os.makedirs(args.output) + cv2.imwrite( + os.path.join(args.output, os.path.basename(image_file)), + draw_img[:, :, ::-1]) + logger.info("The visualized image saved in {}".format( + os.path.join(args.output, os.path.basename(image_file)))) + cnt += 1 + if cnt % 100 == 0: + logger.info("{} processed".format(cnt)) + logger.info("avg time cost: {}".format(float(total_time) / cnt)) + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description="args for hub serving") + parser.add_argument("--server_url", type=str, required=True) + parser.add_argument("--image_dir", type=str, required=True) + parser.add_argument("--visualize", type=str2bool, default=False) + parser.add_argument("--output", type=str, default='./hubserving_result') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ff261e85fec10ec974ff763d6c3747faaa47c8d9 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,209 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
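The request flow exercised by tools/test_hubserving.py above can be reproduced standalone: read the image bytes, base64-encode them into an {'images': [...]} payload, and POST it as JSON to the serving endpoint, exactly as main() does. The sketch below is illustrative only; it assumes a hub-serving module is already running, and the server URL and image path are placeholders rather than values from this repo.

# Minimal sketch of the request made in tools/test_hubserving.py main().
# server_url and image_path are placeholders, not values from this repo.
import base64
import json
import requests

def ocr_via_hubserving(image_path, server_url="http://127.0.0.1:8868/predict/ocr_system"):
    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode("utf8")   # same as cv2_to_base64()
    payload = {"images": [img_b64]}
    headers = {"Content-type": "application/json"}
    r = requests.post(url=server_url, headers=headers, data=json.dumps(payload))
    # For an ocr_system endpoint each result entry typically carries 'text',
    # 'confidence' and 'text_region', which is what draw_server_result() branches on.
    return r.json()["results"][0]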
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '..'))) + +import yaml +import paddle +import paddle.distributed as dist + +from ppocr.data import build_dataloader +from ppocr.modeling.architectures import build_model +from ppocr.losses import build_loss +from ppocr.optimizer import build_optimizer +from ppocr.postprocess import build_post_process +from ppocr.metrics import build_metric +from ppocr.utils.save_load import load_model +from ppocr.utils.utility import set_seed +from ppocr.modeling.architectures import apply_to_static +import tools.program as program + +dist.get_world_size() + + +def main(config, device, logger, vdl_writer): + # init dist environment + if config['Global']['distributed']: + dist.init_parallel_env() + + global_config = config['Global'] + + # build dataloader + train_dataloader = build_dataloader(config, 'Train', device, logger) + if len(train_dataloader) == 0: + logger.error( + "No Images in train dataset, please ensure\n" + + "\t1. The images num in the train label_file_list should be larger than or equal with batch size.\n" + + + "\t2. The annotation file and path in the configuration file are provided normally." + ) + return + + if config['Eval']: + valid_dataloader = build_dataloader(config, 'Eval', device, logger) + else: + valid_dataloader = None + + # build post process + post_process_class = build_post_process(config['PostProcess'], + global_config) + + # build model + # for rec algorithm + if hasattr(post_process_class, 'character'): + char_num = len(getattr(post_process_class, 'character')) + if config['Architecture']["algorithm"] in ["Distillation", + ]: # distillation model + for key in config['Architecture']["Models"]: + if config['Architecture']['Models'][key]['Head'][ + 'name'] == 'MultiHead': # for multi head + if config['PostProcess'][ + 'name'] == 'DistillationSARLabelDecode': + char_num = char_num - 2 + # update SARLoss params + assert list(config['Loss']['loss_config_list'][-1].keys())[ + 0] == 'DistillationSARLoss' + config['Loss']['loss_config_list'][-1][ + 'DistillationSARLoss']['ignore_index'] = char_num + 1 + out_channels_list = {} + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Models'][key]['Head'][ + 'out_channels_list'] = out_channels_list + else: + config['Architecture']["Models"][key]["Head"][ + 'out_channels'] = char_num + elif config['Architecture']['Head'][ + 'name'] == 'MultiHead': # for multi head + if config['PostProcess']['name'] == 'SARLabelDecode': + char_num = char_num - 2 + # update SARLoss params + assert list(config['Loss']['loss_config_list'][1].keys())[ + 0] == 'SARLoss' + if config['Loss']['loss_config_list'][1]['SARLoss'] is None: + config['Loss']['loss_config_list'][1]['SARLoss'] = { + 'ignore_index': char_num + 1 + } + else: + config['Loss']['loss_config_list'][1]['SARLoss'][ + 'ignore_index'] = char_num + 1 + out_channels_list = {} + out_channels_list['CTCLabelDecode'] = char_num + out_channels_list['SARLabelDecode'] = char_num + 2 + config['Architecture']['Head'][ + 'out_channels_list'] = out_channels_list + else: # base rec model + config['Architecture']["Head"]['out_channels'] = char_num + + if config['PostProcess']['name'] == 'SARLabelDecode': # for SAR model + config['Loss']['ignore_index'] = char_num - 
1 + + model = build_model(config['Architecture']) + + use_sync_bn = config["Global"].get("use_sync_bn", False) + if use_sync_bn: + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + logger.info('convert_sync_batchnorm') + + model = apply_to_static(model, config, logger) + + # build loss + loss_class = build_loss(config['Loss']) + + # build optim + optimizer, lr_scheduler = build_optimizer( + config['Optimizer'], + epochs=config['Global']['epoch_num'], + step_each_epoch=len(train_dataloader), + model=model) + + # build metric + eval_class = build_metric(config['Metric']) + + logger.info('train dataloader has {} iters'.format(len(train_dataloader))) + if valid_dataloader is not None: + logger.info('valid dataloader has {} iters'.format( + len(valid_dataloader))) + + use_amp = config["Global"].get("use_amp", False) + amp_level = config["Global"].get("amp_level", 'O2') + amp_custom_black_list = config['Global'].get('amp_custom_black_list', []) + if use_amp: + AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + scale_loss = config["Global"].get("scale_loss", 1.0) + use_dynamic_loss_scaling = config["Global"].get( + "use_dynamic_loss_scaling", False) + scaler = paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + if amp_level == "O2": + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level=amp_level, + master_weight=True) + else: + scaler = None + + # load pretrain model + pre_best_model_dict = load_model(config, model, optimizer, + config['Architecture']["model_type"]) + + if config['Global']['distributed']: + model = paddle.DataParallel(model) + # start train + program.train(config, train_dataloader, valid_dataloader, device, model, + loss_class, optimizer, lr_scheduler, post_process_class, + eval_class, pre_best_model_dict, logger, vdl_writer, scaler, + amp_level, amp_custom_black_list) + + +def test_reader(config, device, logger): + loader = build_dataloader(config, 'Train', device, logger) + import time + starttime = time.time() + count = 0 + try: + for data in loader(): + count += 1 + if count % 1 == 0: + batch_time = time.time() - starttime + starttime = time.time() + logger.info("reader: {}, {}, {}".format( + count, len(data[0]), batch_time)) + except Exception as e: + logger.info(e) + logger.info("finish reader: {}, Success!".format(count)) + + +if __name__ == '__main__': + config, device, logger, vdl_writer = program.preprocess(is_train=True) + seed = config['Global']['seed'] if 'seed' in config['Global'] else 1024 + set_seed(seed) + main(config, device, logger, vdl_writer) + # test_reader(config, device, logger)
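The training entry point above is driven entirely by the YAML file that program.preprocess() loads and dumps back to save_model_dir/config.yml. As an illustration only, the Python dict below mirrors the Global keys that the code in this patch reads; the values are placeholders, and keys such as eval_batch_step, print_batch_step and cal_metric_during_train are assumed to sit under Global because they are unpacked earlier in program.train(), outside this excerpt.

# Illustrative sketch of the Global section consumed by tools/train.py and
# tools/program.py in this patch. Values are examples, not recommendations.
global_config_example = {
    "use_gpu": False,                  # checked by preprocess() before paddle.set_device
    "epoch_num": 50,                   # passed to build_optimizer and bounds the epoch loop
    "print_batch_step": 10,            # assumed: log training stats every N global steps
    "eval_batch_step": [0, 2000],      # assumed: [start_eval_step, eval interval in iters]
    "cal_metric_during_train": True,   # assumed: run eval_class on training batches
    "save_epoch_step": 3,              # save an 'iter_epoch_{N}' checkpoint every N epochs
    "save_model_dir": "./output/",     # where 'latest' and 'best_accuracy' are written
    "use_visualdl": False,             # attach a VDLLogger when True
    "use_wandb": False,                # attach a WandbLogger when True
    "use_amp": False,                  # enable paddle.amp.GradScaler / auto_cast
    "amp_level": "O2",                 # AMP level used by decorate() and auto_cast()
    "use_sync_bn": False,              # convert BatchNorm to SyncBatchNorm
    "seed": 1024,                      # __main__ falls back to 1024 when absent
}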