from typing import Any, Dict
from transformers import AutoTokenizer, LayoutLMForSequenceClassification
import torch
import os


# Install Tesseract and its Python bindings at startup; pytesseract is the
# usual source of words and bounding boxes for LayoutLM, though this example
# keeps them hardcoded (see the OCR sketch at the bottom of the file).
os.system("sudo apt install -y tesseract-ocr")
os.system("pip3 install pytesseract==0.3.9")


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the base LayoutLM tokenizer and sequence-classification model
        # once at startup so every request reuses the same instances.
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        self.model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")


    def __call__(self, data: Any) -> Dict[str, Any]:
        # `data` is ignored in this example: the request payload is not
        # parsed, and the words and boxes below are hardcoded, already
        # normalized to LayoutLM's 0-1000 coordinate scale.
        words = ["Hello", "world"]
        normalized_word_boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]
        
        # A word may split into several sub-word tokens; repeat its box once
        # per token so the boxes stay aligned with input_ids.
        token_boxes = []
        for word, box in zip(words, normalized_word_boxes):
            word_tokens = self.tokenizer.tokenize(word)
            token_boxes.extend([box] * len(word_tokens))
        # Add the bounding boxes for the [CLS] and [SEP] special tokens.
        token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
        
        encoding = self.tokenizer(" ".join(words), return_tensors="pt")
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        token_type_ids = encoding["token_type_ids"]
        bbox = torch.tensor([token_boxes])
        # Dummy label so the forward pass also returns a loss.
        sequence_label = torch.tensor([1])
        
        outputs = self.model(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=sequence_label,
        )
        
        loss = outputs.loss  # loss against the dummy label; not returned
        logits = outputs.logits
        # Convert to a plain list so the response is JSON-serializable.
        return {"logits": logits.tolist()}
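

# ---------------------------------------------------------------------------
# The sketches below are illustrative additions, not part of the original
# handler. `ocr_words_and_boxes` shows how the words and boxes hardcoded in
# __call__ would typically be produced with the pytesseract package installed
# above; both the helper name and the smoke-test payload are hypothetical.
# ---------------------------------------------------------------------------
def ocr_words_and_boxes(image):
    """Extract words and 0-1000-normalized boxes from a PIL image via OCR."""
    import pytesseract
    from pytesseract import Output

    width, height = image.size
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    words, boxes = [], []
    for text, left, top, w, h in zip(
        data["text"], data["left"], data["top"], data["width"], data["height"]
    ):
        if not text.strip():
            continue  # skip empty OCR entries
        words.append(text)
        # Scale pixel coordinates to LayoutLM's 0-1000 coordinate system.
        boxes.append([
            int(1000 * left / width),
            int(1000 * top / height),
            int(1000 * (left + w) / width),
            int(1000 * (top + h) / height),
        ])
    return words, boxes


if __name__ == "__main__":
    # Local smoke test: __call__ ignores its payload and scores the
    # hardcoded example, so any placeholder input works here.
    handler = EndpointHandler()
    print(handler({"inputs": "placeholder"}))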