unimer_demo / unimernet /processors /formula_processor.py
wufan's picture
Upload 111 files
18e4b60 verified
raw
history blame
6.14 kB
from unimernet.common.registry import registry
from omegaconf import OmegaConf
import albumentations as alb
from albumentations.pytorch import ToTensorV2
from unimernet.processors.base_processor import BaseProcessor
import numpy as np
import cv2
from PIL import Image, ImageOps
from torchvision.transforms.functional import resize
import random
from unimernet.processors.formula_processor_helper.nougat import Bitmap, Dilation, Erosion
from unimernet.processors.formula_processor_helper.weather import Fog, Frost, Snow, Rain, Shadow
class FormulaImageBaseProcessor(BaseProcessor):
    """Common preprocessing for formula images: crop margins, resize, pad.

    Subclasses add their own ``transform`` pipeline on top of the padded
    PIL image returned by :meth:`prepare_input`.
    """

    def __init__(self, image_size):
        """Store the target canvas size.

        Args:
            image_size: Either a single integer (interpreted as a square
                canvas) or a two-element sequence. ``prepare_input`` reads
                index 0 as the canvas height and index 1 as the canvas width.
        """
        super(FormulaImageBaseProcessor, self).__init__()
        # A bare integer is not iterable, so the list comprehension below
        # would raise TypeError; expand it to a square size instead.
        if isinstance(image_size, (int, np.integer)):
            image_size = (image_size, image_size)
        # Copied into a fresh list of plain ints: [height, width].
        self.input_size = [int(_) for _ in image_size]
        assert len(self.input_size) == 2, "image_size must have exactly two entries"

    @staticmethod
    def crop_margin(img: Image.Image) -> Image.Image:
        """Crop ``img`` to the bounding box of its darker pixels.

        The image is converted to grayscale and min-max normalised to the
        0..255 range; every pixel whose normalised value is below 200 is
        treated as content. A completely uniform image is returned unchanged.

        Args:
            img: Image to crop.

        Returns:
            The cropped image, or ``img`` itself when all pixels are equal.
        """
        data = np.array(img.convert("L"))
        data = data.astype(np.uint8)
        max_val = data.max()
        min_val = data.min()
        if max_val == min_val:
            # Nothing to distinguish from the background; also avoids a
            # division by zero in the normalisation below.
            return img
        data = (data - min_val) / (max_val - min_val) * 255
        # Binary mask: 255 where the normalised pixel is darker than 200.
        gray = 255 * (data < 200).astype(np.uint8)

        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
        return img.crop((a, b, w + a, h + b))

    def prepare_input(self, img: Image.Image, random_padding: bool = False):
        """Crop, resize and pad a PIL image to ``self.input_size``.

        Steps:
            - convert to RGB and crop the margins
            - resize using ``min(self.input_size)`` as the target size
            - shrink (keeping aspect ratio) to fit inside the canvas
            - pad up to exactly (height, width) = ``self.input_size``

        Args:
            img: Input image, or ``None``.
            random_padding: When true, the left/top padding is drawn at
                random (via ``np.random.randint``); otherwise the image is
                centred on the canvas.

        Returns:
            The padded PIL image, or ``None`` when ``img`` is ``None``, when
            cropping raises ``OSError``, or when the cropped image has a
            zero-sized side.
        """
        if img is None:
            return None
        # crop margins
        try:
            img = self.crop_margin(img.convert("RGB"))
        except OSError:
            # might throw an error for broken files
            return None
        if img.height == 0 or img.width == 0:
            return None

        img = resize(img, min(self.input_size))
        # thumbnail() takes (width, height), hence the swapped indices.
        img.thumbnail((self.input_size[1], self.input_size[0]))
        delta_width = self.input_size[1] - img.width
        delta_height = self.input_size[0] - img.height
        if random_padding:
            pad_width = np.random.randint(low=0, high=delta_width + 1)
            pad_height = np.random.randint(low=0, high=delta_height + 1)
        else:
            pad_width = delta_width // 2
            pad_height = delta_height // 2
        # Border sizes in (left, top, right, bottom) order; the right/bottom
        # entries absorb whatever remains so the result fills the canvas.
        padding = (
            pad_width,
            pad_height,
            delta_width - pad_width,
            delta_height - pad_height,
        )
        return ImageOps.expand(img, padding)
@registry.register_processor("formula_image_train")
class FormulaImageTrainProcessor(FormulaImageBaseProcessor):
    """Training-time processor: padded formula image plus augmentations.

    Registered under the name ``"formula_image_train"``.
    """

    def __init__(self, image_size=384):
        """Build the augmentation pipeline.

        Args:
            image_size: A single integer (square canvas) or a two-element
                sequence, forwarded to the base processor.
        """
        # The default is a bare int, which the base class would try to
        # iterate over; expand it to a pair so the default is usable.
        if isinstance(image_size, (int, np.integer)):
            image_size = [image_size, image_size]
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                # Geometric / degradation group, applied as a whole with p=.15.
                alb.Compose(
                    [
                        Bitmap(p=0.05),
                        alb.OneOf([Fog(), Frost(), Snow(), Rain(), Shadow()], p=0.2),
                        alb.OneOf([Erosion((2, 3)), Dilation((2, 3))], p=0.2),
                        alb.ShiftScaleRotate(
                            shift_limit=0,
                            scale_limit=(-.15, 0),
                            rotate_limit=1,
                            border_mode=0,
                            interpolation=3,
                            value=[255, 255, 255],
                            p=1,
                        ),
                        alb.GridDistortion(
                            distort_limit=0.1,
                            border_mode=0,
                            interpolation=3,
                            value=[255, 255, 255],
                            p=.5,
                        ),
                    ],
                    p=.15,
                ),
                # alb.InvertImg(p=.15),
                alb.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
                alb.GaussNoise(10, p=.2),
                alb.RandomBrightnessContrast(.05, (-.2, 0), True, p=0.2),
                alb.ImageCompression(95, p=.3),
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Preprocess and augment one image.

        Args:
            item: PIL image (or ``None``) passed to ``prepare_input`` with
                random padding enabled.

        Returns:
            The first slice (``[:1]``) of the transformed ``'image'`` output,
            or ``None`` when ``prepare_input`` rejects the input.
        """
        img = self.prepare_input(item, random_padding=True)
        if img is None:
            return img
        # Presumably ToTensorV2 yields a channel-first tensor, so [:1] keeps a
        # single channel of the grayscale image -- TODO confirm.
        return self.transform(image=np.array(img))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Create a processor from a config; ``image_size`` defaults to [384, 384]."""
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(
            image_size=image_size,
        )
@registry.register_processor("formula_image_multi_scale_train")
class FormulaImageMultiScaleTrainProcessor(FormulaImageTrainProcessor):
    """Training processor whose canvas size can be re-drawn from a set of scales.

    Registered under the name ``"formula_image_multi_scale_train"``.
    """

    def __init__(self, all_scales):
        """Initialise with the first scale and remember all of them.

        Args:
            all_scales: Sequence of two-element size sequences. The first
                entry is used as the initial canvas size.
        """
        # Build a new list of int pairs instead of assigning back into the
        # caller's object: the argument may be a shared config node or an
        # immutable tuple, and must not be modified as a side effect.
        normalized_scales = [[int(_) for _ in scales] for scales in all_scales]
        super(FormulaImageMultiScaleTrainProcessor, self).__init__(normalized_scales[0])
        self.all_scales = normalized_scales

    @classmethod
    def from_config(cls, cfg=None):
        """Create a processor from a config; ``all_scales`` defaults to [[384, 384]]."""
        if cfg is None:
            cfg = OmegaConf.create()

        all_scales = cfg.get("all_scales", [[384, 384]])
        return cls(
            all_scales=all_scales
        )

    def reset_scale(self):
        """Pick one of ``self.all_scales`` at random as the new ``input_size``."""
        self.input_size = random.choice(self.all_scales)
@registry.register_processor("formula_image_eval")
class FormulaImageEvalProcessor(FormulaImageBaseProcessor):
    """Evaluation-time processor: padded formula image, no augmentation.

    Registered under the name ``"formula_image_eval"``.
    """

    def __init__(self, image_size):
        """Build the deterministic gray/normalize/to-tensor pipeline.

        Args:
            image_size: Target canvas size, forwarded to the base processor.
        """
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Preprocess one image with centred padding.

        Args:
            item: PIL image (or ``None``) passed to ``prepare_input``.

        Returns:
            The first slice (``[:1]``) of the transformed ``'image'`` output,
            or ``None`` when ``prepare_input`` rejects the input.
        """
        image = self.prepare_input(item)
        # prepare_input returns None for missing, broken or empty images;
        # without this guard np.array(None) would be fed to the transform.
        # The training processor applies the same guard.
        if image is None:
            return None
        return self.transform(image=np.array(image))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Create a processor from a config; ``image_size`` defaults to [384, 384]."""
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(image_size=image_size)