from img2table.ocr import DocTR from torchvision import transforms from transformers import AutoModelForObjectDetection import torch def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=1) def rescale_bboxes(out_bbox, size): width, height = size boxes = box_cxcywh_to_xyxy(out_bbox) boxes = boxes * torch.tensor( [width, height, width, height], dtype=torch.float32 ) return boxes def outputs_to_objects(outputs, img_size, id2label): m = outputs.logits.softmax(-1).max(-1) pred_labels = list(m.indices.detach().cpu().numpy())[0] pred_scores = list(m.values.detach().cpu().numpy())[0] pred_bboxes = outputs["pred_boxes"].detach().cpu()[0] pred_bboxes = [ elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size) ] objects = [] for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): class_label = id2label[int(label)] if not class_label == "no object": objects.append( { "label": class_label, "score": float(score), "bbox": [float(elem) for elem in bbox], } ) return objects class MaxResize(object): def __init__(self, max_size=800): self.max_size = max_size def __call__(self, image): width, height = image.size current_max_size = max(width, height) scale = self.max_size / current_max_size resized_image = image.resize( (int(round(scale * width)), int(round(scale * height))) ) return resized_image detection_transform = transforms.Compose( [ MaxResize(800), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), ] ) device = "cuda" if torch.cuda.is_available() else "cpu" model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm").to(device) ocr = DocTR()