document-extractors / unstructuredio /unstructured_pdf.py
rishiraj's picture
Create unstructuredio/unstructured_pdf.py
153d307 verified
raw
history blame
No virus
1.82 kB
import os
import tempfile
from typing import List, Union, Optional

from indexify_extractor_sdk import Content, Extractor, Feature
from pydantic import BaseModel, Field
from unstructured.partition.pdf import partition_pdf
class UnstructuredIOConfig(BaseModel):
    # Tunable parameters that UnstructuredIOExtractor.extract forwards,
    # unchanged, as keyword arguments to partition_pdf.
    # (Kept as comments rather than a docstring: a docstring on a pydantic
    # model is copied into the generated JSON schema description.)

    # Partitioning strategy; one of "auto", "hi_res", "ocr_only", and "fast".
    strategy: Optional[str] = "auto"

    # Forwarded as partition_pdf's hi_res_model_name argument.
    hi_res_model_name: Optional[str] = "yolox"

    # Forwarded as partition_pdf's infer_table_structure argument.
    infer_table_structure: Optional[bool] = True
class UnstructuredIOExtractor(Extractor):
    """Split a PDF into plain-text pieces using unstructured's ``partition_pdf``.

    Every element returned by the partitioner becomes one text ``Content``
    carrying a metadata feature with the element's class name and page number.
    """

    name = "tensorlake/unstructuredio"
    description = "This extractor uses unstructured.io to extract pieces of pdf document into separate plain text content data."
    system_dependencies = ["libmagic-dev", "poppler-utils", "tesseract-ocr"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: UnstructuredIOConfig) -> List[Union[Feature, Content]]:
        """Partition the PDF held in ``content.data`` into text contents.

        Args:
            content: Input whose ``data`` attribute holds the PDF payload; it
                is written to a binary temporary file, so it must be
                bytes-like.
            params: Options forwarded to ``partition_pdf`` (``strategy``,
                ``hi_res_model_name``, ``infer_table_structure``).

        Returns:
            One ``Content`` per partitioned element, built from the element's
            text and tagged with a metadata feature of the form
            ``{"type": <element class name>, "page_number": <page>}``.
        """
        # partition_pdf takes a file path, so the payload is spilled to disk.
        # delete=False lets us close the handle before the file is reopened by
        # name; in exchange we are responsible for removing it ourselves.
        pdf_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            with pdf_file:
                pdf_file.write(content.data)
            # Leaving the `with` closed (and therefore flushed) the file.
            elements = partition_pdf(
                pdf_file.name,
                strategy=params.strategy,
                hi_res_model_name=params.hi_res_model_name,
                infer_table_structure=params.infer_table_structure,
            )
        finally:
            # Always remove the temporary file, including when writing or
            # partitioning raised; previously it was left behind on every call.
            os.unlink(pdf_file.name)

        contents = []
        for el in elements:
            feature = Feature.metadata(
                value={
                    "type": type(el).__name__,
                    "page_number": el.metadata.page_number,
                }
            )
            contents.append(Content.from_text(el.text, features=[feature]))
        return contents

    def sample_input(self) -> Content:
        """Return the result of ``self.sample_scientific_pdf()`` as sample input.

        NOTE(review): ``sample_scientific_pdf`` is not defined in this class;
        presumably it is inherited from ``Extractor`` - confirm.
        """
        return self.sample_scientific_pdf()