rishiraj committed on
Commit
153d307
1 Parent(s): 0ef7f8f

Create unstructuredio/unstructured_pdf.py

Browse files
Files changed (1) hide show
  1. unstructuredio/unstructured_pdf.py +39 -0
unstructuredio/unstructured_pdf.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unstructured.partition.pdf import partition_pdf
2
+ import tempfile
3
+ from typing import List, Union, Optional
4
+ from indexify_extractor_sdk import Content, Extractor, Feature
5
+ from pydantic import BaseModel, Field
6
+
7
# Options for UnstructuredIOExtractor.extract; each field is forwarded as the
# same-named keyword argument of unstructured's ``partition_pdf``.
class UnstructuredIOConfig(BaseModel):
    # Partitioning strategy: "auto", "hi_res", "ocr_only", or "fast".
    strategy: Optional[str] = "auto"
    # Model name handed to ``partition_pdf`` as ``hi_res_model_name``.
    hi_res_model_name: Optional[str] = "yolox"
    # Handed to ``partition_pdf`` as ``infer_table_structure``.
    infer_table_structure: Optional[bool] = True
11
+
12
class UnstructuredIOExtractor(Extractor):
    """Split a PDF into separate plain-text contents using unstructured.io.

    Each element returned by ``partition_pdf`` becomes one text ``Content``
    carrying a metadata feature with the element's class name and page number.
    """

    name = "tensorlake/unstructuredio"
    description = "This extractor uses unstructured.io to extract pieces of pdf document into separate plain text content data."
    system_dependencies = ["libmagic-dev", "poppler-utils", "tesseract-ocr"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: UnstructuredIOConfig) -> List[Union[Feature, Content]]:
        """Partition the PDF bytes in ``content.data`` into text contents.

        Args:
            content: Input content whose ``data`` holds the raw PDF bytes.
            params: Options forwarded to ``partition_pdf`` (``strategy``,
                ``hi_res_model_name``, ``infer_table_structure``).

        Returns:
            One ``Content`` per partitioned element, in the order returned by
            ``partition_pdf``. Each has a metadata feature with the keys
            ``"type"`` (element class name) and ``"page_number"``.
        """
        # Local import keeps this block self-contained; `os` is only needed
        # for removing the temporary file below.
        import os

        # partition_pdf is given a filesystem path, so the bytes are spooled
        # to a named temp file. delete=False lets the file be closed (and
        # thereby fully flushed) before it is reopened by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as inputtmpfile:
            inputtmpfile.write(content.data)
            pdf_path = inputtmpfile.name

        try:
            elements = partition_pdf(
                pdf_path,
                strategy=params.strategy,
                hi_res_model_name=params.hi_res_model_name,
                infer_table_structure=params.infer_table_structure,
            )

            contents = []
            for el in elements:
                feature = Feature.metadata(
                    value={
                        "type": type(el).__name__,
                        "page_number": el.metadata.page_number,
                    }
                )
                contents.append(Content.from_text(el.text, features=[feature]))
            return contents
        finally:
            # Always remove the temp file, even if partitioning fails;
            # otherwise every call would leave a PDF copy on disk.
            os.unlink(pdf_path)

    def sample_input(self) -> Content:
        """Return the sample scientific PDF provided by ``sample_scientific_pdf``."""
        return self.sample_scientific_pdf()