|
from typing import Any |
|
from subprocess import run |
|
from docquery import document, pipeline |
|
import tempfile |
|
import os |
|
|
|
|
|
|
|
# Install the Tesseract OCR system package when this module is imported.
# NOTE(review): presumably needed by docquery for OCR on scanned documents --
# confirm. The command is passed as an argument list so no shell is involved;
# check=True raises CalledProcessError if the installation fails.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
|
|
|
class EndpointHandler:
    """Callable handler that asks a fixed set of invoice questions about a PDF.

    The handler wraps a ``document-question-answering`` pipeline built once in
    ``__init__`` and reused for every call.
    """

    # Questions put to the pipeline for every document, in this order.
    QUESTIONS = (
        "What is the invoice number?",
        "What is the invoice total?",
    )

    def __init__(self, path=""):
        """Build the question-answering pipeline.

        Args:
            path: Value forwarded to the pipeline factory as ``model``.
        """
        self.pipeline = pipeline('document-question-answering', model=path)

    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
        """Run every question in ``QUESTIONS`` against the submitted PDF.

        Args:
            data: Request payload. The raw PDF bytes are expected under the
                ``"inputs"`` key, which is popped from (removed from) ``data``.

        Returns:
            ``{"predictions": [...]}`` holding one pipeline result per
            question, in the same order as ``QUESTIONS``.

        Raises:
            TypeError: If ``"inputs"`` is missing or is not a bytes-like
                object.
        """
        payload = data.pop("inputs", data)
        # Reject bad input before touching the filesystem. Without this the
        # failure would only surface later, when writing to the binary file.
        if not isinstance(payload, (bytes, bytearray, memoryview)):
            raise TypeError(
                "expected PDF bytes under 'inputs', "
                f"got {type(payload).__name__}"
            )

        # mkstemp creates the file atomically with a unique name, so there is
        # no reliance on private tempfile helpers or on a guessable path.
        # The ".pdf" suffix is kept from the original naming scheme.
        # NOTE(review): the loader may use the extension to pick a parser --
        # confirm before changing the suffix.
        fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
        try:
            with os.fdopen(fd, "wb") as pdf_file:
                pdf_file.write(payload)

            doc = document.load_document(pdf_path)
            results = [
                self.pipeline(question=question, **doc.context)
                for question in self.QUESTIONS
            ]
        finally:
            # Best-effort cleanup: a file that is already gone is reported
            # but must not mask the real outcome of the request.
            try:
                os.remove(pdf_path)
            except FileNotFoundError as e:
                print(e)

        return {"predictions": results}
|
|
|
|
|
|
|
|
|
|