import json
import os
import tempfile
from typing import List, Optional, Union

import fitz
from indexify_extractor_sdk import Content, Extractor, Feature
from pydantic import BaseModel, Field

from .utils.tt_module import get_tables


class PDFExtractorConfig(BaseModel):
    """Options selecting which kinds of output ``PDFExtractor.extract`` emits."""

    # The extractor checks for the entries "text", "image" and "table";
    # all three are enabled by default.
    output_types: List[str] = Field(default_factory=lambda: ["text", "image", "table"])


def _page_images(doc, page, page_number: int) -> List[Content]:
    """Return one ``image/png`` Content per image referenced by ``page``."""
    images = []
    for img in page.get_images():
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)

        # Pixmaps that are neither grayscale nor RGB are converted to RGB
        # before being serialised.
        # NOTE(review): PyMuPDF documents that ``colorspace`` can be None for
        # image masks, which would raise AttributeError here -- confirm
        # whether such images need to be handled.
        if pix.colorspace.name not in (fitz.csGRAY.name, fitz.csRGB.name):
            pix = fitz.Pixmap(fitz.csRGB, pix)

        feature = Feature.metadata(value={"type": "image", "page": page_number})
        images.append(
            Content(content_type="image/png", data=pix.tobytes(), features=[feature])
        )
    return images


def _document_tables(pdf_bytes) -> List[Content]:
    """Return one ``application/json`` Content per entry yielded by ``get_tables``."""
    tables = []
    # Loop variables are deliberately distinct from the caller's ``content``
    # and ``page`` so nothing the caller still needs is rebound.
    for table_page, table in get_tables(pdf_bytes).items():
        feature = Feature.metadata(value={"type": "table", "page": int(table_page)})
        tables.append(
            Content(
                content_type="application/json",
                data=json.dumps(table),
                features=[feature],
            )
        )
    return tables


class PDFExtractor(Extractor):
    """Extractor that splits a PDF into per-page text, embedded images and tables."""

    name = "tensorlake/pdf-extractor"
    description = "PDF Extractor for Texts, Images & Tables"
    system_dependencies = ["poppler-utils"]
    input_mime_types = ["application/pdf"]

    def __init__(self):
        super().__init__()

    def extract(self, content: Content, params: PDFExtractorConfig) -> List[Union[Feature, Content]]:
        """Extract the requested output types from a PDF.

        Args:
            content: Input whose ``data`` holds the raw PDF bytes.
            params: Configuration; ``params.output_types`` selects which of
                "text", "image" and "table" are produced.

        Returns:
            A list of Content objects: for each page (in order) its text and
            then its images, followed by all tables. Every item carries a
            metadata feature with its ``type`` and 1-based ``page`` (for
            tables the page is the ``get_tables`` key converted with ``int``).
        """
        contents = []
        output_types = params.output_types

        # delete=False so the file can be reopened by name; it is removed
        # explicitly in the ``finally`` below so no temp file is left behind.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        try:
            with tmp:
                tmp.write(content.data)

            doc = fitz.open(tmp.name)
            try:
                for page_number, page in enumerate(doc, start=1):
                    if "text" in output_types:
                        feature = Feature.metadata(
                            value={"type": "text", "page": page_number}
                        )
                        contents.append(
                            Content.from_text(page.get_text(), features=[feature])
                        )

                    if "image" in output_types:
                        contents.extend(_page_images(doc, page, page_number))
            finally:
                # Release the document handle before unlinking the file.
                doc.close()
        finally:
            os.unlink(tmp.name)

        # Tables are extracted once for the whole document, not once per page.
        if "table" in output_types:
            contents.extend(_document_tables(content.data))

        return contents

    def sample_input(self) -> Content:
        """Return the sample scientific PDF provided by ``sample_scientific_pdf``."""
        return self.sample_scientific_pdf()