Spaces:

tomofi
/

MaskTextSpotterV3-OCR

Runtime error

App Files Files Community

MaskTextSpotterV3-OCR / tools / demo.py

3v324v23

add

c310e19 over 2 years ago

raw

history blame

9.63 kB

	import os
	import cv2
	import torch
	from torchvision import transforms as T

	from maskrcnn_benchmark.modeling.detector import build_detection_model
	from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
	from maskrcnn_benchmark.structures.image_list import to_image_list
	from maskrcnn_benchmark.config import cfg
	from maskrcnn_benchmark.utils.chars import getstr_grid, get_tight_rect

	from PIL import Image
	import numpy as np
	import argparse

	class TextDemo(object):
	    """
	    Runs a text detection + recognition model on single OpenCV images.

	    The model is built with ``build_detection_model`` from the given config,
	    switched to eval mode, moved to ``cfg.MODEL.DEVICE`` and its weights are
	    loaded from ``cfg.MODEL.WEIGHT``.
	    """

	    def __init__(
	        self,
	        cfg,
	        confidence_threshold=0.7,
	        min_image_size=224,
	        output_polygon=True
	    ):
	        """
	        Arguments:
	            cfg: config node; a clone is stored on the instance
	            confidence_threshold (float): detections whose score is below
	                this value are dropped
	            min_image_size (int): size handed to ``T.Resize`` when
	                pre-processing the input image
	            output_polygon (bool): if True, results are contour polygons;
	                otherwise they come from a minimum-area rectangle
	        """
	        self.cfg = cfg.clone()
	        self.model = build_detection_model(cfg)
	        self.model.eval()
	        self.device = torch.device(cfg.MODEL.DEVICE)
	        self.model.to(self.device)
	        self.min_image_size = min_image_size

	        checkpointer = DetectronCheckpointer(cfg, self.model)
	        checkpointer.load(cfg.MODEL.WEIGHT)

	        self.transforms = self.build_transform()
	        self.cpu_device = torch.device("cpu")
	        self.confidence_threshold = confidence_threshold
	        self.output_polygon = output_polygon

	    def build_transform(self):
	        """
	        Creates a basic transformation that was used to train the models
	        """
	        cfg = self.cfg
	        # we are loading images with OpenCV, so we don't need to convert them
	        # to BGR, they are already! So all we need to do is to normalize
	        # by 255 if we want to convert to BGR255 format, or flip the channels
	        # if we want it to be in RGB in [0-1] range.
	        if cfg.INPUT.TO_BGR255:
	            to_bgr_transform = T.Lambda(lambda x: x * 255)
	        else:
	            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])

	        normalize_transform = T.Normalize(
	            mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD
	        )

	        return T.Compose(
	            [
	                T.ToPILImage(),
	                T.Resize(self.min_image_size),
	                T.ToTensor(),
	                to_bgr_transform,
	                normalize_transform,
	            ]
	        )

	    def run_on_opencv_image(self, image):
	        """
	        Arguments:
	            image (np.ndarray): an image as returned by OpenCV
	        Returns:
	            result_polygons (list): detection results
	            result_words (list): recognition results
	        """
	        return self.compute_prediction(image)

	    def compute_prediction(self, original_image):
	        """
	        Runs the model on one image and post-processes its output.

	        Arguments:
	            original_image (np.ndarray): an image as returned by OpenCV
	        Returns:
	            result_polygons (list): one flat ``[x0, y0, x1, y1, ...]`` list
	                of ints per detection that passed the confidence threshold
	            result_words (list): the word chosen for each kept detection
	        """
	        # apply pre-processing to image
	        image = self.transforms(original_image)
	        # convert to an ImageList, padded so that it is divisible by
	        # cfg.DATALOADER.SIZE_DIVISIBILITY
	        image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY)
	        image_list = image_list.to(self.device)
	        # compute predictions
	        with torch.no_grad():
	            predictions, _, _ = self.model(image_list)
	            global_predictions = predictions[0]
	            char_predictions = predictions[1]
	            words, rec_scores = self.process_char_mask(
	                char_predictions['char_mask'], char_predictions['boxes']
	            )
	            seq_words = char_predictions['seq_outputs']
	            seq_scores = char_predictions['seq_scores']

	        global_predictions = [o.to(self.cpu_device) for o in global_predictions]

	        # always single image is passed at a time
	        global_prediction = global_predictions[0]

	        # reshape prediction (a BoxList) into the original image size;
	        # shape[:2] is (height, width) whether or not a channel axis exists
	        height, width = original_image.shape[:2]
	        global_prediction = global_prediction.resize((width, height))
	        boxes = global_prediction.bbox.tolist()
	        scores = global_prediction.get_field("scores").tolist()
	        masks = global_prediction.get_field("mask").cpu().numpy()

	        result_polygons = []
	        result_words = []
	        for k, box in enumerate(boxes):
	            if scores[k] < self.confidence_threshold:
	                continue
	            box = list(map(int, box))
	            polygon = self.mask2polygon(
	                masks[k, 0, :, :],
	                box,
	                original_image.shape,
	                threshold=0.5,
	                output_polygon=self.output_polygon,
	            )
	            if polygon is None:
	                # no usable mask shape: fall back to the box corners
	                polygon = [box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]
	            result_polygons.append(polygon)

	            # Prefer the sequence output only when its mean per-character
	            # score beats the char-mask score; with no per-character scores
	            # there is nothing to average, so keep the char-mask word.
	            seq_char_scores = seq_scores[k]
	            num_scores = len(seq_char_scores)
	            use_seq_word = (
	                num_scores > 0
	                and sum(seq_char_scores) / float(num_scores) > rec_scores[k]
	            )
	            result_words.append(seq_words[k] if use_seq_word else words[k])
	        return result_polygons, result_words

	    def process_char_mask(self, char_masks, boxes, threshold=192):
	        """
	        Decodes a word from every character mask with ``getstr_grid``.

	        Arguments:
	            char_masks: array indexed along its first axis, one entry per
	                detection (presumably a 4-D np.ndarray -- TODO confirm)
	            boxes: per-detection boxes; each is converted to a list of ints
	            threshold (int): forwarded unchanged to ``getstr_grid``
	        Returns:
	            texts (list): decoded text per detection
	            rec_scores (list): recognition score per detection
	        """
	        texts, rec_scores = [], []
	        for index in range(char_masks.shape[0]):
	            box = [int(v) for v in boxes[index]]
	            # getstr_grid receives a copy so the caller's array is not shared
	            text, rec_score, _, _ = getstr_grid(
	                char_masks[index, :, :, :].copy(), box, threshold=threshold
	            )
	            texts.append(text)
	            rec_scores.append(rec_score)
	        return texts, rec_scores

	    def mask2polygon(self, mask, box, im_size, threshold=0.5, output_polygon=True):
	        """
	        Converts one instance mask into a polygon in image coordinates.

	        Arguments:
	            mask (np.ndarray): 2-D mask for one detection (the original
	                comment notes "mask 32*128"; values presumably in [0, 1])
	            box (list[int]): [x0, y0, x1, y1] of the detection
	            im_size: image shape, read as (height, width, ...)
	            threshold (float): binarization threshold applied to the
	                resized, blurred mask
	            output_polygon (bool): if True, approximate the largest contour;
	                otherwise use a minimum-area rectangle
	        Returns:
	            A flat list of ints ``[x0, y0, x1, y1, ...]``, or None when no
	            polygon can be produced (degenerate box, empty mask, or fewer
	            than three contour points).
	        """
	        image_width, image_height = im_size[1], im_size[0]
	        box_h = box[3] - box[1]
	        box_w = box[2] - box[0]
	        if box_w <= 0 or box_h <= 0:
	            # the mask cannot be resized to an empty box
	            return None
	        poly_map = self._binarize_mask(mask, box_w, box_h, threshold)
	        if output_polygon:
	            return self._contour_polygon(poly_map, box)
	        return self._tight_rect_polygon(poly_map, box, image_height, image_width)

	    def _binarize_mask(self, mask, box_w, box_h, threshold):
	        """Resizes the mask to the box, then blurs, thresholds and cleans it."""
	        cls_polys = (mask * 255).astype(np.uint8)
	        poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h)))
	        poly_map = poly_map.astype(np.float32) / 255
	        poly_map = cv2.GaussianBlur(poly_map, (3, 3), sigmaX=3)
	        _, poly_map = cv2.threshold(poly_map, threshold, 1, cv2.THRESH_BINARY)
	        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
	        poly_map = cv2.erode(poly_map, kernel)
	        poly_map = cv2.dilate(poly_map, kernel)
	        return cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, kernel)

	    def _contour_polygon(self, poly_map, box):
	        """Approximates the largest contour and shifts it by the box origin."""
	        binary = (poly_map * 255).astype(np.uint8)
	        # findContours returns (image, contours, hierarchy) in OpenCV 3 and
	        # (contours, hierarchy) in OpenCV 4; contours is second-to-last in both
	        contours = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]
	        if len(contours) == 0:
	            return None
	        max_cnt = max(contours, key=cv2.contourArea)
	        epsilon = 0.01 * cv2.arcLength(max_cnt, True)
	        pts = cv2.approxPolyDP(max_cnt, epsilon, True).reshape((-1, 2))
	        pts[:, 0] = pts[:, 0] + box[0]
	        pts[:, 1] = pts[:, 1] + box[1]
	        polygon = [int(v) for v in pts.reshape((-1,))]
	        # fewer than three points is not a polygon
	        if len(polygon) < 6:
	            return None
	        return polygon

	    def _tight_rect_polygon(self, poly_map, box, image_height, image_width):
	        """Fits a minimum-area rectangle around the mask's foreground pixels."""
	        idy, idx = np.where(poly_map == 1)
	        if idx.size == 0:
	            # empty mask: there are no points to build a hull from
	            return None
	        xy = np.column_stack((idx, idy))
	        hull = cv2.convexHull(xy, clockwise=True)
	        if hull is None:
	            return None
	        # reverse order of points.
	        hull = hull[::-1]
	        # find minimum area bounding box.
	        rect = cv2.minAreaRect(hull)
	        corners = np.array(cv2.boxPoints(rect), dtype="int")
	        # NOTE(review): get_tight_rect presumably offsets the corners by the
	        # box origin and clips them to the image -- confirm in utils.chars
	        pts = get_tight_rect(corners, box[0], box[1], image_height, image_width, 1)
	        return [int(v) for v in pts]

	    def visualization(self, image, polygons, words):
	        """
	        Draws every polygon and its word on ``image`` in place.

	        Arguments:
	            image (np.ndarray): image to draw on (modified in place)
	            polygons (list): flat ``[x0, y0, x1, y1, ...]`` lists
	            words (list): text drawn at the polygon's (min x, min y)
	        """
	        for polygon, word in zip(polygons, words):
	            pts = np.array(polygon, np.int32).reshape((-1, 1, 2))
	            # plain ints: some OpenCV builds reject numpy scalars as a point
	            xmin = int(min(pts[:, 0, 0]))
	            ymin = int(min(pts[:, 0, 1]))
	            cv2.polylines(image, [pts], True, (0, 0, 255))
	            cv2.putText(image, word, (xmin, ymin), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255), 2)


	def main(args):
	    """
	    Runs the demo on one image and writes the annotated result to disk.

	    Arguments:
	        args: parsed command-line namespace providing ``config_file``,
	            ``image_path`` and ``visu_path``
	    Raises:
	        IOError: if the input image cannot be read or the result image
	            cannot be written
	    """
	    # update the config options with the config file
	    cfg.merge_from_file(args.config_file)
	    # manual override some options
	    # cfg.merge_from_list(["MODEL.DEVICE", "cpu"])

	    text_demo = TextDemo(
	        cfg,
	        min_image_size=800,
	        confidence_threshold=0.7,
	        output_polygon=True
	    )

	    # load image and then run prediction; cv2.imread signals failure by
	    # returning None instead of raising, so check it explicitly
	    image = cv2.imread(args.image_path)
	    if image is None:
	        raise IOError("could not read image: {}".format(args.image_path))

	    result_polygons, result_words = text_demo.run_on_opencv_image(image)
	    text_demo.visualization(image, result_polygons, result_words)

	    # cv2.imwrite reports failure through its return value
	    if not cv2.imwrite(args.visu_path, image):
	        raise IOError("could not write image: {}".format(args.visu_path))

	if __name__ == "__main__":
	    # Command-line entry point: every option is a string with a default,
	    # so the script runs with no arguments.
	    cli = argparse.ArgumentParser(description='parameters for demo')
	    for flag, default in (
	        ("--config-file", 'configs/mixtrain/seg_rec_poly_fuse_feature.yaml'),
	        ("--image_path", './demo_images/demo.jpg'),
	        ("--visu_path", './demo_images/demo_results.jpg'),
	    ):
	        cli.add_argument(flag, type=str, default=default)
	    args = cli.parse_args()
	    main(args)