jinhybr committed
Commit 04e6a7b
1 Parent(s): 5fa47bf

Create handler.py

handler.py ADDED
@@ -0,0 +1,66 @@
+ from typing import Any, Dict, List
+ from subprocess import run
+
+ import torch
+ from transformers import AutoModelForTokenClassification, AutoProcessor
+
+ # install tesseract-ocr and pytesseract so the processor can run OCR on the image
+ run("apt install -y tesseract-ocr", shell=True, check=True)
+ run("pip install pytesseract", shell=True, check=True)
+
+
+ # helper function to unnormalize bboxes for drawing onto the image
+ def unnormalize_box(bbox, width, height):
+     return [
+         width * (bbox[0] / 1000),
+         height * (bbox[1] / 1000),
+         width * (bbox[2] / 1000),
+         height * (bbox[3] / 1000),
+     ]
+
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # load model and processor from path
+         self.model = AutoModelForTokenClassification.from_pretrained(path).to(device)
+         self.processor = AutoProcessor.from_pretrained(path)
+
+     def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the deserialized image file as a PIL.Image under the "inputs" key
+         """
+         # process input
+         image = data.pop("inputs", data)
+
+         # preprocess the image (the processor runs OCR and tokenizes the detected words)
+         encoding = self.processor(image, return_tensors="pt")
+
+         # run prediction
+         with torch.inference_mode():
+             outputs = self.model(
+                 input_ids=encoding.input_ids.to(device),
+                 bbox=encoding.bbox.to(device),
+                 attention_mask=encoding.attention_mask.to(device),
+                 pixel_values=encoding.pixel_values.to(device),
+             )
+             predictions = outputs.logits.softmax(-1)
+
+         # post-process output: keep every token that is not labeled "O"
+         result = []
+         for item, inp_ids, bbox in zip(
+             predictions.squeeze(0).cpu(), encoding.input_ids.squeeze(0).cpu(), encoding.bbox.squeeze(0).cpu()
+         ):
+             label = self.model.config.id2label[int(item.argmax().cpu())]
+             if label == "O":
+                 continue
+             score = item.max().item()
+             text = self.processor.tokenizer.decode(inp_ids)
+             bbox = unnormalize_box(bbox.tolist(), image.width, image.height)
+             result.append({"label": label, "score": score, "text": text, "bbox": bbox})
+         return {"predictions": result}
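
For reference, a minimal local smoke test of this handler might look like the sketch below. It is not part of the commit: it assumes you run it from a checkout of this repository (so `path="."` resolves to the model and processor files), and `sample_invoice.png` is a placeholder file name for any document image.

# local smoke test (sketch, not part of the commit): load the handler from the
# current repository checkout and run it on a sample document image
from PIL import Image

from handler import EndpointHandler

handler = EndpointHandler(path=".")        # "." assumes the repo checkout contains the model files
image = Image.open("sample_invoice.png")   # placeholder image path
output = handler({"inputs": image})

for pred in output["predictions"]:
    # each prediction carries the label, its softmax score, the decoded token text,
    # and the bounding box scaled back to pixel coordinates
    print(pred["label"], round(pred["score"], 3), pred["text"], pred["bbox"])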