Alexander Slessor committed on
Commit
1d2c57b
1 Parent(s): 475995c

completed initial handler.py

Browse files
Files changed (3) hide show
  1. .gitignore +13 -0
  2. README.md +5 -0
  3. handler.py +56 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.ipynb
3
+ *.pdf
4
+
5
+ test_handler_local.py
6
+ test_handler.py
7
+ test_endpoint.py
8
+
9
+ setup
10
+ upload_to_hf
11
+ requirements.txt
12
+ hf_token.py
13
+
README.md CHANGED
@@ -7,11 +7,16 @@ tags:
7
  - document-question-answering
8
  - pdf
9
  - invoices
 
10
  widget:
11
  - text: "What is the invoice number?"
12
  src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
13
  - text: "What is the purchase amount?"
14
  src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/contract.jpeg"
 
 
 
 
15
  ---
16
 
17
  # LayoutLM for Invoices
 
7
  - document-question-answering
8
  - pdf
9
  - invoices
10
+ - endpoints-template
11
  widget:
12
  - text: "What is the invoice number?"
13
  src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
14
  - text: "What is the purchase amount?"
15
  src: "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/contract.jpeg"
16
+ library_name: generic
17
+ model-index:
18
+ - name: layoutlm-invoices
19
+ results: []
20
  ---
21
 
22
  # LayoutLM for Invoices
handler.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ from subprocess import run
3
+ from docquery import document, pipeline
4
+ import tempfile
5
+ import os
6
+ # from transformers import AutoConfig, AutoTokenizer, LayoutLMForQuestionAnswering
7
+
8
# Install the Tesseract OCR system package when this module is imported.
# The command is passed as an argument list (no shell) so nothing is subject
# to shell parsing; check=True raises CalledProcessError if apt exits non-zero.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
10
+
11
class EndpointHandler:
    """Endpoint handler that answers a fixed set of invoice questions about a PDF.

    The handler wraps a docquery ``document-question-answering`` pipeline and
    runs every question in ``QUESTIONS`` against the submitted document.
    """

    # Questions asked of every document; predictions are returned in this order.
    QUESTIONS = ("What is the invoice number?", "What is the invoice total?")

    def __init__(self, path=""):
        """Build the question-answering pipeline.

        Args:
            path: Model location forwarded to the pipeline as ``model``.
        """
        self.pipeline = pipeline('document-question-answering', model=path)

    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
        """Run the fixed questions against the PDF carried in ``data``.

        Args:
            data: Request payload. The raw PDF bytes are read from (and popped
                out of) the ``"inputs"`` key.

        Returns:
            ``{"predictions": [...]}`` with one pipeline result per entry in
            ``QUESTIONS``, in the same order.

        Raises:
            TypeError: If the payload is not a bytes-like object (including
                the case where ``"inputs"`` is missing).
        """
        # process input
        f_bytes = data.pop("inputs", data)

        # Validate before touching the filesystem. memoryview() accepts exactly
        # the objects a binary file's write() accepts, so valid inputs are
        # unaffected and invalid ones get a clear message.
        try:
            memoryview(f_bytes)
        except TypeError as err:
            raise TypeError(
                f'"inputs" must be a bytes-like object containing the PDF, '
                f'got {type(f_bytes).__name__}'
            ) from err

        # mkstemp atomically creates a uniquely named file in the system temp
        # directory; the path is bound before the try block, so the cleanup in
        # `finally` can never hit an unbound name.
        fd, temp_file_path = tempfile.mkstemp(suffix='.pdf')
        try:
            with os.fdopen(fd, 'wb') as temp_file:
                temp_file.write(f_bytes)

            # The document is loaded from a path, hence the temporary file.
            doc = document.load_document(temp_file_path)
            results = [
                self.pipeline(question=q, **doc.context)
                for q in self.QUESTIONS
            ]
        finally:
            # Best-effort cleanup: a file that is already gone is only reported.
            try:
                os.remove(temp_file_path)
            except FileNotFoundError as e:
                print(e)

        return {"predictions": results}
53
+
54
+
55
+
56
+