import os
from typing import Any, Dict

import torch
from transformers import AutoTokenizer, LayoutLMForSequenceClassification

# Install the Tesseract OCR binary and its Python binding at startup;
# LayoutLM pipelines typically use them to extract words and bounding
# boxes from document images (not yet used by this minimal handler).
os.system("sudo apt install -y tesseract-ocr")
os.system("pip3 install pytesseract==0.3.9")


class EndpointHandler:
    def __init__(self, path: str = ""):
        # `path` is the local model directory provided at deployment;
        # this example loads the base checkpoint from the Hub instead.
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        self.model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
|
    def __call__(self, data: Any) -> Dict[str, Any]:
        # Hard-coded example input; a real handler would derive words
        # and boxes from `data` (e.g. with pytesseract on an image).
        words = ["Hello", "world"]
        # Bounding boxes are (x0, y0, x1, y1), normalized to the
        # 0-1000 scale LayoutLM expects.
        normalized_word_boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]

        # Repeat each word's box for every sub-word token it produces.
        token_boxes = []
        for word, box in zip(words, normalized_word_boxes):
            word_tokens = self.tokenizer.tokenize(word)
            token_boxes.extend([box] * len(word_tokens))
        # Add the special boxes for the [CLS] and [SEP] tokens.
        token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        encoding = self.tokenizer(" ".join(words), return_tensors="pt")
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        token_type_ids = encoding["token_type_ids"]
        bbox = torch.tensor([token_boxes])
        sequence_label = torch.tensor([1])
|
        # Forward pass; no gradients are needed at inference time.
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                bbox=bbox,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=sequence_label,
            )

        # Supplying `labels` makes the model also return a loss; only
        # the logits are exposed in the response.
        loss = outputs.loss
        logits = outputs.logits
        # Convert the tensor to a plain list so the response is
        # JSON-serializable.
        return {"logits": logits.tolist()}
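
# A minimal local smoke test, assuming this module is used as a custom
# inference handler (instantiated once, then called per request with a
# JSON payload). The payload below is a hypothetical placeholder; this
# example ignores `data` entirely.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": ""})
    print(result["logits"])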