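"""Batch-OCR local PDFs with Google Cloud Document AI.

The script creates an OCR processor, runs every PDF under docs/ through it,
splits the recognized text into paragraph-level chunks, and writes the chunks
(with file name, file type, and chunk number) to doc_ai/pdf_data.csv.
Run it from the repository root so the relative docs/ and doc_ai/ paths resolve.
"""
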
from __future__ import annotations

import glob
import os

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import pandas as pd


def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
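    """Create a new Document AI OCR processor under the given project/location.

    Relies on the module-level ``client_options`` configured in the
    ``__main__`` block, so call it only after that is set.
    """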
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # The full resource name of the location,
    # e.g. projects/{project_id}/locations/{location}
    parent = client.common_location_path(project_id, location)

    # Create a processor
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )


def process_document(
    processor_name: str,
    file_path: str,
) -> documentai.Document:
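    """Run a single file through the processor and return the parsed Document.

    Like ``create_processor``, this reads the module-level ``client_options``
    defined in the ``__main__`` block.
    """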
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )
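    # NOTE: this is online (synchronous) processing, which limits request size
    # and page count; very large PDFs may need Document AI batch processing.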
    # Configure the process request
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    result = client.process_document(request=request)
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by offsets
    into the document's full text. This function converts those offsets to a
    string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


def pdf_processor(processor_name: str, extracted_data: list[dict]) -> list[dict]:
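    """OCR every PDF under docs/ and append paragraph chunks to extracted_data."""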
    # Loop through each PDF file in the "docs" directory.
    for path in glob.glob("docs/*.pdf"):
        # Split the path into the base name and file extension.
        file_name, file_type = os.path.splitext(path)
        print(f"Processing {file_name}")

        # Process the document.
        document = process_document(processor_name, file_path=path)
        if not document:
            print("Processing did not complete successfully.")
            continue
        # Split the text into chunks based on paragraphs.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]
        # The text could also be chunked by page or by block instead:
        # document_chunks = [
        #     layout_to_text(page.layout, document.text) for page in document.pages
        # ]
        # document_chunks = [
        #     layout_to_text(block.layout, document.text)
        #     for page in document.pages
        #     for block in page.blocks
        # ]

        # Loop through each chunk and create a dictionary with metadata and content.
        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            # Append the chunk information to the extracted_data list.
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data


if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
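    # Route requests to the regional Document AI endpoint for this location.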
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    processor = create_processor(project_id, location, processor_display_name)
    # If you already have a Document AI Processor in your project, assign its
    # full processor resource name here instead.
    processor_name = processor.name

    extracted_data: list[dict] = []
    extracted_data = pdf_processor(processor_name, extracted_data)

    # Convert extracted_data to a sorted Pandas DataFrame.
    pdf_data = (
        pd.DataFrame.from_dict(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )
    # Preview the first rows, then persist the chunks to CSV.
    print(pdf_data.head())
    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)