philschmid HF staff committed on
Commit
d4347a5
1 Parent(s): f1a16dd

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +26 -38
handler.py CHANGED
@@ -1,46 +1,34 @@
1
  from typing import Dict, List, Any
2
- from transformers import AutoTokenizer, LayoutLMForSequenceClassification
3
- import torch
4
- import os
5
-
6
-
7
- os.system("sudo apt install -y tesseract-ocr")
8
- os.system("pip3 install pytesseract==0.3.9")
9
 
10
 
11
  class EndpointHandler():
12
  def __init__(self, path=""):
13
- self.tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
14
- self.model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased") # load the optimized model
 
 
 
15
 
16
 
17
  def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
18
- words = ["Hello", "world"]
19
- normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
20
-
21
- token_boxes = []
22
- for word, box in zip(words, normalized_word_boxes):
23
- word_tokens = tokenizer.tokenize(word)
24
- token_boxes.extend([box] * len(word_tokens))
25
- # add bounding boxes of cls + sep tokens
26
- token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
27
-
28
- encoding = tokenizer(" ".join(words), return_tensors="pt")
29
- input_ids = encoding["input_ids"]
30
- attention_mask = encoding["attention_mask"]
31
- token_type_ids = encoding["token_type_ids"]
32
- bbox = torch.tensor([token_boxes])
33
- sequence_label = torch.tensor([1])
34
-
35
- outputs = self.model(
36
- input_ids=input_ids,
37
- bbox=bbox,
38
- attention_mask=attention_mask,
39
- token_type_ids=token_type_ids,
40
- labels=sequence_label,
41
- )
42
-
43
- loss = outputs.loss
44
- logits = outputs.logits
45
- return {"logits": logits}
46
-
 
1
  from typing import Dict, List, Any
2
+ from optimum.onnxruntime import ORTModelForSequenceClassification
3
+ from transformers import pipeline, AutoTokenizer
 
 
 
 
 
4
 
5
 
6
  class EndpointHandler():
7
  def __init__(self, path=""):
8
+ # load the optimized model
9
+ model = ORTModelForSequenceClassification.from_pretrained(path)
10
+ tokenizer = AutoTokenizer.from_pretrained(path)
11
+ # create inference pipeline
12
+ self.pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
13
 
14
 
15
  def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
16
+ """
17
+ Args:
18
+ data (:obj:):
19
+ includes the input data and the parameters for the inference.
20
+ Return:
21
+ A :obj:`list`: The object returned should be a list of one list like [[{"label": "LABEL", "score": 0.9939950108528137}]] containing:
22
+ - "label": A string representing what the label/class is. There can be multiple labels.
23
+ - "score": A score between 0 and 1 describing how confident the model is for this label/class.
24
+ """
25
+ inputs = data.pop("inputs", data)
26
+ parameters = data.pop("parameters", None)
27
+
28
+ # pass inputs with all kwargs in data
29
+ if parameters is not None:
30
+ prediction = self.pipeline(inputs, **parameters)
31
+ else:
32
+ prediction = self.pipeline(inputs)
33
+ # postprocess the prediction
34
+ return prediction