File size: 4,381 Bytes
9610b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from __future__ import annotations

import glob
import os
import re
import textwrap

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import pandas as pd

def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a new Document AI OCR processor and return it.

    Args:
        project_id: Google Cloud project ID that will own the processor.
        location: Processor region (e.g. "us", "eu"); also selects the
            regional API endpoint.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor`` resource (its ``name``
        field is the full processor resource name).
    """
    # Build the regional endpoint from ``location`` instead of reading the
    # module-level ``client_options`` global, which is only defined when this
    # file runs as a script (importing this function alone would NameError).
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    # The full resource name of the location,
    # e.g. projects/{project_id}/locations/{location}
    parent = client.common_location_path(project_id, location)

    # Create an OCR-type processor under that location.
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )

def process_document(
    processor_name: str,
    file_path: str,
    mime_type: str = "application/pdf",
) -> documentai.Document:
    """Send one local file through a Document AI processor synchronously.

    Args:
        processor_name: Full processor resource name, e.g.
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path to the local file to process.
        mime_type: MIME type of the file. Defaults to PDF, generalizing the
            previously hard-coded value so other formats can be processed.

    Returns:
        The processed ``documentai.Document`` with text and layout data.
    """
    # NOTE(review): relies on a module-level ``client_options`` global that is
    # only defined when this file runs as a script — confirm before importing
    # this helper from another module.
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # Read the file into memory.
    with open(file_path, "rb") as source_file:
        file_content = source_file.read()

    # Wrap the raw bytes for inline (non-batch) processing.
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # Configure and send the process request.
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    result = client.process_document(request=request)

    return result.document

def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Resolve a layout's text-anchor segments against the full document text.

    Document AI locates text by (start_index, end_index) offsets into the
    entire document's text. A logical span may be stored as several text
    segments (e.g. when it crosses line breaks); this concatenates them in
    order into a single string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)

def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """OCR every PDF matching ``pdf_glob`` and append per-paragraph chunks.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List the chunk records are appended to (mutated in
            place and also returned for convenience).
        pdf_glob: Glob pattern selecting the PDFs to process. Defaults to
            ``"docs/*.pdf"``, the previously hard-coded pattern.

    Returns:
        ``extracted_data`` with one dict per paragraph chunk, carrying
        ``file_name``, ``file_type``, ``chunk_number`` and ``content``.
    """
    for path in glob.glob(pdf_glob):
        # NOTE(review): splitext keeps the directory prefix, so file_name is
        # e.g. "docs/report" — confirm downstream consumers expect that.
        file_name, file_type = os.path.splitext(path)

        print(f"Processing {file_name}")
        document = process_document(processor_name, file_path=path)

        if not document:
            print("Processing did not complete successfully.")
            continue

        # Chunk by paragraph; each layout's offsets point into document.text.
        # Could also chunk by page or by block instead of paragraph.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]

        # Record each chunk with its source-file metadata, 1-indexed.
        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data


if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)

    # If you already have a Document AI Processor in your project, assign the full processor resource name here.
    processor_name = processor.name
    chunk_size = 5000
    extracted_data: list[dict] = []

    extracted_data = pdf_processor(processor_name, extracted_data)

    # Convert extracted_data to a sorted Pandas DataFrame
    pdf_data = (
        pd.DataFrame.from_dict(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )

    pdf_data.head()

    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)