from __future__ import annotations

import glob
import os

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai


def create_processor(
    project_id: str,
    location: str,
    processor_display_name: str,
    client_options: ClientOptions,
) -> documentai.Processor:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the location,
    # e.g. projects/{project_id}/locations/{location}
    parent = client.common_location_path(project_id, location)

    # Create an OCR processor under that location.
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )


def process_document(
    processor_name: str,
    file_path: str,
    client_options: ClientOptions,
) -> documentai.Document:
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Read the file into memory.
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load the binary data into a Document AI RawDocument object.
    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    # Configure and send the process request.
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    result = client.process_document(request=request)

    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by its
    offsets into the entirety of the document's text. This function converts
    those offsets to a string.
    """
    # If a span of text covers several lines, it is
    # stored in multiple text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    client_options: ClientOptions,
) -> list[dict]:
    # Loop through each PDF file in the "docs" directory.
    for path in glob.glob("docs/*.pdf"):
        # Split the path into the file name (without extension) and the file type.
        file_name, file_type = os.path.splitext(path)
        print(f"Processing {file_name}")

        # Process the document.
        document = process_document(
            processor_name, file_path=path, client_options=client_options
        )

        if not document:
            print("Processing did not complete successfully.")
            continue

        # Split the text into chunks based on paragraphs.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]

        # Chunks could also be split by page or by block. The commented lines
        # below assume a wrapped document helper (e.g. from the Document AI
        # Toolbox), which this script does not import:
        # document_chunks = [page.text for page in wrapped_document.pages]
        # document_chunks = [block.text for page in wrapped_document.pages for block in page.blocks]

        # Loop through each chunk and create a dictionary with metadata and content.
        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            # Append the chunk information to the extracted_data list.
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data


if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"

    client_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )

    processor = create_processor(
        project_id, location, processor_display_name, client_options
    )

    # If you already have a Document AI processor in your project,
    # assign the full processor resource name here instead.
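    # A minimal sketch of that resource name's format (PROCESSOR_ID is a
    # placeholder, not a value from this script):
    # processor_name = f"projects/{project_id}/locations/{location}/processors/PROCESSOR_ID"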
    processor_name = processor.name
    chunk_size = 5000  # Defined here but not applied anywhere in this script.

    extracted_data: list[dict] = []
    extracted_data = pdf_processor(processor_name, extracted_data, client_options)

    # Convert extracted_data to a sorted pandas DataFrame.
    pdf_data = (
        pd.DataFrame(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )

    # Preview the first rows, then write the chunks out as CSV.
    print(pdf_data.head())
    os.makedirs("doc_ai", exist_ok=True)
    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)
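    # A quick sanity check, added as a sketch and not part of the original
    # script: rebuild each file's full text by re-joining its chunks in
    # order, using the same columns written above, then print the total
    # text length recovered per file.
    full_texts = (
        pdf_data.sort_values(by=["file_name", "chunk_number"])
        .groupby("file_name")["content"]
        .apply("\n".join)
    )
    print(full_texts.str.len())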