# Document AI OCR pipeline: creates an OCR processor, extracts per-paragraph
# text chunks from local PDFs, and writes them to a CSV for downstream use.
from __future__ import annotations

import glob
import os
import re
import textwrap

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a new Document AI OCR processor.

    Args:
        project_id: Google Cloud project ID to create the processor in.
        location: Processor region, e.g. ``"us"`` or ``"eu"``.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor`` resource.
    """
    # Build the regional endpoint locally from ``location`` instead of reading
    # the module-level ``client_options`` global, which only exists when this
    # file runs as a script (importing the module would raise NameError).
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # The full resource name of the location,
    # e.g. projects/{project_id}/locations/{location}
    parent = client.common_location_path(project_id, location)
    # Create an OCR processor under that location.
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )
def process_document(
    processor_name: str,
    file_path: str,
    mime_type: str = "application/pdf",
) -> documentai.Document:
    """Run a local file through a Document AI processor.

    Args:
        processor_name: Full processor resource name, in the form
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file; defaults to PDF for backward
            compatibility with the original hard-coded value.

    Returns:
        The processed ``documentai.Document``.
    """
    # Derive the regional endpoint from the resource name's location segment
    # instead of reading the module-level ``client_options`` global, which is
    # only defined when this file runs as a script. Falls back to "us" if the
    # name does not match the documented resource-name format.
    match = re.search(r"/locations/([^/]+)/", processor_name)
    location = match.group(1) if match else "us"
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # Read the file into memory.
    with open(file_path, "rb") as handle:
        content = handle.read()
    # Load the binary data into a Document AI RawDocument object.
    raw_document = documentai.RawDocument(content=content, mime_type=mime_type)
    # Configure and send the synchronous process request.
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    result = client.process_document(request=request)
    return result.document
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Assemble the text referenced by a layout's text anchor.

    Document AI identifies text in different parts of the document by
    start/end offsets into the full document text; a logical span that
    crosses lines is stored as multiple segments. This concatenates every
    referenced segment into a single string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)
def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """OCR every PDF matching ``pdf_glob`` and collect per-paragraph chunks.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List that chunk records are appended to (mutated in
            place and also returned, matching the original contract).
        pdf_glob: Glob pattern selecting the PDFs to process. Defaults to
            ``"docs/*.pdf"``, the previously hard-coded pattern.

    Returns:
        ``extracted_data`` with one dict per paragraph chunk appended, each
        holding ``file_name``, ``file_type``, ``chunk_number``, ``content``.
    """
    for path in glob.glob(pdf_glob):
        # Split the path into base name and extension for the metadata.
        file_name, file_type = os.path.splitext(path)
        print(f"Processing {file_name}")
        document = process_document(processor_name, file_path=path)
        if not document:
            print("Processing did not complete successfully.")
            continue
        # Split the text into chunks, one per paragraph. (Chunking by page
        # or by block via document.pages / page.blocks would also work.)
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]
        # Record each chunk with its source-file metadata.
        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data
if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    # Assigned at module scope on purpose: create_processor/process_document
    # read this global when the file is run as a script.
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)
    # If you already have a Document AI Processor in your project, assign the
    # full processor resource name here instead of creating a new one.
    processor_name = processor.name
    extracted_data: list[dict] = []
    extracted_data = pdf_processor(processor_name, extracted_data)
    # Convert extracted_data to a DataFrame sorted by source file name.
    pdf_data = (
        pd.DataFrame.from_dict(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )
    # Print (rather than discard) a preview of the extracted chunks.
    print(pdf_data.head())
    # Ensure the output directory exists before writing the CSV.
    os.makedirs("doc_ai", exist_ok=True)
    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)