# Spaces: tomofi/MaskTextSpotterV3-OCR
# Status: Runtime error
# File size: 9,096 Bytes
# Revision: c310e19


# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
Simple dataset class that wraps a list of path names
"""

import os
import numpy as np
import torch
from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.segmentation_mask import (
    SegmentationCharMask,
    SegmentationMask,
)
from PIL import Image, ImageDraw


class SynthtextDataset(object):
    """Map-style dataset over images with word/character ground truth.

    A list file names one image per line (relative to ``imgs_dir``); the
    ground truth of each image is read from ``<gts_dir>/<name>.txt``.  Every
    ground-truth line is a comma-separated sequence of 9-field groups
    ``x1,y1,x2,y2,x3,y3,x4,y4,text``: the first group describes the word,
    any following groups describe its characters.

    ``ignore_difficult`` and ``filtered_gts`` are stored but not consulted
    by any method of this class.
    """

    def __init__(self, use_charann, list_file_path, imgs_dir, gts_dir, transforms=None, ignore_difficult=False):
        """Read the list file and derive image / ground-truth paths.

        Args:
            use_charann: whether character-level annotations are produced.
            list_file_path: text file with one image name per line.
            imgs_dir: directory the image names are relative to.
            gts_dir: directory holding ``<image name>.txt`` annotations.
            transforms: optional callable ``(img, target) -> (img, target)``.
            ignore_difficult: stored on the instance; unused in this class.
        """
        self.use_charann = use_charann
        with open(list_file_path, "r") as list_file:
            # Blank lines would otherwise turn into entries pointing at the
            # directories themselves.
            names = [line.strip() for line in list_file if line.strip()]
        self.image_lists = [os.path.join(imgs_dir, name) for name in names]
        self.gt_lists = [os.path.join(gts_dir, name + ".txt") for name in names]
        self.filtered_gts = []
        self.transforms = transforms
        self.min_proposal_size = 2
        # Index in this string is the character class id; index 0 is "_".
        self.char_classes = "_0123456789abcdefghijklmnopqrstuvwxyz"
        self.vis = False
        self.ignore_difficult = ignore_difficult

    def __getitem__(self, item):
        """Load sample ``item`` and return ``(img, target, img_path)``.

        If the image at ``item`` cannot be opened or decoded, the following
        indices are tried in turn; the returned path tells the caller which
        image was actually used.

        Raises:
            IndexError: if ``item`` (or every index tried after it) is past
                the end of the list.
        """
        while True:
            img_path = self.image_lists[item]
            try:
                # The context manager closes the underlying file as soon as
                # the pixel data has been converted.
                with Image.open(img_path) as raw_img:
                    img = raw_img.convert("RGB")
            except Exception:
                # Best effort: skip unreadable images, but let
                # KeyboardInterrupt / SystemExit propagate.
                item += 1
            else:
                break
        im_name = os.path.basename(img_path)
        width, height = img.size
        words, boxes, charsbbs, segmentations = self.load_gt_from_txt(
            self.gt_lists[item], height, width
        )
        target = BoxList(
            boxes[:, :4], img.size, mode="xyxy", use_char_ann=self.use_charann
        )
        target.add_field("labels", torch.ones(len(boxes)))
        target.add_field("masks", SegmentationMask(segmentations, img.size))
        # An empty first word is the placeholder load_gt_from_txt emits when
        # the image has no usable annotation.
        use_char_ann = words[0] != "" and bool(self.use_charann)
        char_masks = SegmentationCharMask(
            charsbbs,
            words=words,
            use_char_ann=use_char_ann,
            size=img.size,
            char_num_classes=len(self.char_classes),
        )
        target.add_field("char_masks", char_masks)
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        if self.vis:
            self._save_visualization(img, target, im_name)
        return img, target, img_path

    def _save_visualization(self, img, target, im_name):
        """Write a debug overlay of masks and boxes to ``./vis/char_<im_name>``."""
        # NOTE(review): these constants are presumably the per-channel means
        # removed by the transforms — confirm against the transform config.
        new_im = img.numpy().copy().transpose([1, 2, 0]) + [
            102.9801,
            115.9465,
            122.7717,
        ]
        new_im = Image.fromarray(new_im.astype(np.uint8)).convert("RGB")
        mask = target.extra_fields["masks"].polygons[0].convert("mask")
        mask = Image.fromarray((mask.numpy() * 255).astype(np.uint8)).convert("RGB")
        if self.use_charann:
            m, _ = (
                target.extra_fields["char_masks"]
                .chars_boxes[0]
                .convert("char_mask")
            )
            color = self.creat_color_map(len(self.char_classes), 255)
            color_map = color[m.numpy().astype(np.uint8)]
            char = Image.fromarray(color_map.astype(np.uint8)).convert("RGB")
            char = Image.blend(char, new_im, 0.5)
        else:
            char = new_im
        new = Image.blend(char, mask, 0.5)
        img_draw = ImageDraw.Draw(new)
        for box in target.bbox.numpy():
            box = list(box)
            # Closed rectangle outline: (x1,y1) -> (x2,y1) -> (x2,y2) -> (x1,y2) -> (x1,y1).
            box = box[:2] + [box[2], box[1]] + box[2:] + [box[0], box[3]] + box[:2]
            img_draw.line(box, fill=(255, 0, 0), width=2)
        # Saving fails if the output directory is missing.
        os.makedirs("./vis", exist_ok=True)
        new.save("./vis/char_" + im_name)

    def creat_color_map(self, n_class, width):
        """Return an ``(N, 3)`` array of RGB triples used as a class palette.

        ``splits`` is the ceiling of the cube root of ``n_class``.  Red and
        green take ``splits`` evenly spaced levels in ``[0, width]``; blue
        takes only the first ``splits - 1`` of those levels, so
        ``N == splits * splits * (splits - 1)``.
        """
        splits = int(np.ceil(np.power((n_class * 1.0), 1.0 / 3)))
        maps = []
        for i in range(splits):
            r = int(i * width * 1.0 / (splits - 1))
            for j in range(splits):
                g = int(j * width * 1.0 / (splits - 1))
                for k in range(splits - 1):
                    b = int(k * width * 1.0 / (splits - 1))
                    maps.append([r, g, b])
        return np.array(maps)

    def __len__(self):
        """Return the number of listed images."""
        return len(self.image_lists)

    def load_gt_from_txt(self, gt_path, height=None, width=None):
        """Parse one ground-truth file.

        Args:
            gt_path: path of the annotation text file.
            height, width: accepted for interface compatibility; unused.

        Returns:
            A tuple ``(words, boxes, charsboxes, segmentations)``:

            * ``words``: list of word strings (``[""]`` if nothing usable).
            * ``boxes``: ``(n, 5)`` array; columns 0-3 are
              ``min_x, min_y, max_x, max_y`` (each coordinate shifted by -1)
              and column 4 is the word index.
            * ``charsboxes``: with ``use_charann``, one list per word of
              10-element float32 arrays (8 corner coordinates, character
              class, word index); otherwise a single all-zero placeholder.
            * ``segmentations``: one ``[polygon]`` list per word, the polygon
              being the 8 word-corner coordinates.

        Words whose text is ``"###"`` and blank lines are skipped.
        """
        words, boxes, charsboxes, segmentations = [], [], [], []
        with open(gt_path) as gt_file:
            lines = gt_file.readlines()
        for line in lines:
            if not line.strip():
                # A blank line carries no annotation and cannot be parsed.
                continue
            strs, loc = self.line2boxes(line)
            word = strs[0]
            if word == "###":
                continue
            rect = list(loc[0])
            xs, ys = rect[::2], rect[1::2]
            box = [min(xs) - 1, min(ys) - 1, max(xs) - 1, max(ys) - 1]
            segmentations.append([loc[0, :]])
            tindex = len(boxes)
            boxes.append(box)
            words.append(word)
            if not self.use_charann:
                # Character boxes are replaced by a placeholder below, so
                # there is no point in building (or validating) them here.
                continue
            c_class = self.char2num(strs[1:])
            charbb = np.zeros((10,), dtype=np.float32)
            charbbs = []
            if loc.shape[0] > 1:
                for i in range(1, loc.shape[0]):
                    charbb[:8] = loc[i, :]
                    charbb[8] = c_class[i - 1]
                    charbb[9] = tindex
                    charbbs.append(charbb.copy())
            else:
                # Word without character groups: keep one all-zero entry.
                charbbs.append(charbb.copy())
            charsboxes.append(charbbs)

        placeholder = np.zeros((10,), dtype=np.float32)
        num_boxes = len(boxes)
        if num_boxes == 0:
            words.append("")
            return (
                words,
                np.zeros((1, 5), dtype=np.float32),
                [[placeholder]],
                [[np.zeros((8,), dtype=np.float32)]],
            )

        keep_boxes = np.zeros((num_boxes, 5))
        keep_boxes[:, :4] = np.array(boxes)
        # The 5th column is the box label, same as the 10th column of all
        # charsboxes which belong to the box.
        keep_boxes[:, 4] = np.arange(num_boxes)
        if self.use_charann:
            return words, keep_boxes, charsboxes, segmentations
        return words, keep_boxes, [[placeholder]], segmentations

    def line2boxes(self, line):
        """Split one annotation line into texts and corner coordinates.

        Returns:
            ``(strs, loc)`` where ``strs`` holds every 9th field (the texts)
            and ``loc`` is an ``(n, 8)`` integer array of the coordinates
            ``x1, y1, ..., x4, y4`` of each group, truncated towards zero.
        """
        parts = line.strip().split(",")
        # A UTF-8 byte-order mark may survive either as three mis-decoded
        # characters or as U+FEFF, depending on how the file was decoded.
        if parts[0].startswith("\xef\xbb\xbf"):
            parts[0] = parts[0][3:]
        if "\ufeff" in parts[0]:
            parts[0] = parts[0].replace("\ufeff", "")
        columns = [
            np.array([int(float(value)) for value in parts[offset::9]])
            for offset in range(8)
        ]
        strs = parts[8::9]
        loc = np.vstack(columns).transpose()
        return strs, loc

    def check_charbbs(self, charbbs):
        """Return a boolean mask of the rows of ``charbbs`` that are large enough.

        A row passes when both its x extent and its y extent (over the four
        corners stored in columns 0-7) exceed ``self.min_proposal_size``.
        """
        xs = [charbbs[:, 0], charbbs[:, 2], charbbs[:, 4], charbbs[:, 6]]
        ys = [charbbs[:, 1], charbbs[:, 3], charbbs[:, 5], charbbs[:, 7]]
        x_extent = np.maximum.reduce(xs) - np.minimum.reduce(xs)
        y_extent = np.maximum.reduce(ys) - np.minimum.reduce(ys)
        return np.logical_and(
            x_extent > self.min_proposal_size,
            y_extent > self.min_proposal_size,
        )

    def check_charbb(self, charbb):
        """Single-box counterpart of :meth:`check_charbbs`."""
        xs = (charbb[0], charbb[2], charbb[4], charbb[6])
        ys = (charbb[1], charbb[3], charbb[5], charbb[7])
        return (
            max(xs) - min(xs) > self.min_proposal_size
            and max(ys) - min(ys) > self.min_proposal_size
        )

    def char2num(self, chars):
        """Map characters to class ids, case-insensitively.

        ``chars`` is a list such as ``['h', 'e', 'l', 'l', 'o']``; each id is
        the position of the lower-cased character in ``self.char_classes``.

        Raises:
            ValueError: if a character does not occur in ``self.char_classes``.
        """
        return [self.char_classes.index(c.lower()) for c in chars]

    def get_img_info(self, item):
        """
        Return the image dimensions for the image, without
        loading and pre-processing it
        """
        img_path = self.image_lists[item]
        # Reading ``size`` only needs the header; close the file right after.
        with Image.open(img_path) as img:
            width, height = img.size
        return {
            "im_name": os.path.basename(img_path),
            "height": height,
            "width": width,
        }