|
from __future__ import annotations

import glob
import os  # was `import osa` — typo that raised ImportError at startup
import re
import textwrap

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import pandas as pd
|
|
|
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a Document AI OCR processor.

    Args:
        project_id: Google Cloud project to create the processor in.
        location: Document AI region (e.g. ``"us"``); also selects the
            regional API endpoint.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor`` resource.
    """
    # Build the regional endpoint from our own `location` parameter instead of
    # depending on a module-level `client_options` global that is only defined
    # under the __main__ guard (the original broke when imported as a module).
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    parent = client.common_location_path(project_id, location)

    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )
|
|
|
def process_document(
    processor_name: str,
    file_path: str,
) -> documentai.Document:
    """Run one local PDF through a Document AI processor synchronously.

    Args:
        processor_name: Full processor resource name, i.e.
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path to a local PDF file.

    Returns:
        The processed ``documentai.Document`` (OCR result).

    Raises:
        ValueError: If ``processor_name`` does not contain a location segment.
    """
    # Recover the region from the resource name so we can build the regional
    # endpoint locally, instead of reading a `client_options` global that only
    # exists when the file runs as a script.
    location_match = re.search(r"/locations/([^/]+)/", processor_name)
    if not location_match:
        raise ValueError(f"Invalid processor resource name: {processor_name}")
    opts = ClientOptions(
        api_endpoint=f"{location_match.group(1)}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)

    return result.document
|
|
|
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Resolve a layout's text anchor against the full document text.

    Document AI describes where a piece of text lives as (start, end) offset
    segments into the document's complete text. This gathers every segment
    the layout points at and concatenates them into one string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)
|
|
|
def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """OCR every PDF matching a glob and collect one record per paragraph.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List the chunk records are appended to (mutated
            in place and also returned for convenience).
        pdf_glob: Glob pattern selecting the PDFs to process. Defaults to
            ``"docs/*.pdf"``, matching the original hard-coded behavior.

    Returns:
        ``extracted_data``, extended with dicts of the form
        ``{"file_name", "file_type", "chunk_number", "content"}``.
    """
    for path in glob.glob(pdf_glob):
        # NOTE: file_name keeps any directory prefix (e.g. "docs/report"),
        # matching the original behavior downstream code may rely on.
        file_name, file_type = os.path.splitext(path)

        print(f"Processing {file_name}")

        document = process_document(processor_name, file_path=path)

        if not document:
            print("Processing did not complete successfully.")
            continue

        # One chunk per paragraph, in page order.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]

        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data
|
|
|
|
|
if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    # Kept module-level on purpose: the helper functions above read this
    # global when building their Document AI clients.
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)

    processor_name = processor.name
    extracted_data: list[dict] = []

    extracted_data = pdf_processor(processor_name, extracted_data)

    if extracted_data:
        pdf_data = (
            pd.DataFrame.from_dict(extracted_data)
            .sort_values(by=["file_name"])
            .reset_index(drop=True)
        )

        # Print (not just evaluate) the preview so it is visible in a script run.
        print(pdf_data.head())

        # Make sure the output directory exists; to_csv does not create it.
        os.makedirs("doc_ai", exist_ok=True)
        pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)
    else:
        # An empty result would make sort_values raise KeyError on "file_name".
        print("No PDF chunks extracted; nothing to write.")
|
|
|
|
|
|
|
|
|
|