# Document AI OCR pipeline: creates an OCR processor, extracts per-paragraph
# text chunks from local PDFs, and writes them to a CSV for downstream use.
from __future__ import annotations

import glob
import os
import re
import textwrap

import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a new Document AI OCR processor.

    Args:
        project_id: Google Cloud project ID to create the processor in.
        location: Processor region, e.g. ``"us"`` or ``"eu"``.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor`` resource.
    """
    # Build the regional endpoint locally from ``location`` instead of reading
    # the module-level ``client_options`` global, which only exists when this
    # file runs as a script (importing the module would raise NameError).
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # The full resource name of the location,
    # e.g. projects/{project_id}/locations/{location}
    parent = client.common_location_path(project_id, location)
    # Create an OCR processor under that location.
    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )
def process_document(
    processor_name: str,
    file_path: str,
    mime_type: str = "application/pdf",
) -> documentai.Document:
    """Run a local file through a Document AI processor.

    Args:
        processor_name: Full processor resource name, in the form
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file; defaults to PDF for backward
            compatibility with the original hard-coded value.

    Returns:
        The processed ``documentai.Document``.
    """
    # Derive the regional endpoint from the resource name's location segment
    # instead of reading the module-level ``client_options`` global, which is
    # only defined when this file runs as a script. Falls back to "us" if the
    # name does not match the documented resource-name format.
    match = re.search(r"/locations/([^/]+)/", processor_name)
    location = match.group(1) if match else "us"
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    # Read the file into memory.
    with open(file_path, "rb") as handle:
        content = handle.read()
    # Load the binary data into a Document AI RawDocument object.
    raw_document = documentai.RawDocument(content=content, mime_type=mime_type)
    # Configure and send the synchronous process request.
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    result = client.process_document(request=request)
    return result.document
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Assemble the text referenced by a layout's text anchor.

    Document AI identifies text in different parts of the document by
    start/end offsets into the full document text; a logical span that
    crosses lines is stored as multiple segments. This concatenates every
    referenced segment into a single string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)
def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """OCR every PDF matching ``pdf_glob`` and collect per-paragraph chunks.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List that chunk records are appended to (mutated in
            place and also returned, matching the original contract).
        pdf_glob: Glob pattern selecting the PDFs to process. Defaults to
            ``"docs/*.pdf"``, the previously hard-coded pattern.

    Returns:
        ``extracted_data`` with one dict per paragraph chunk appended, each
        holding ``file_name``, ``file_type``, ``chunk_number``, ``content``.
    """
    for path in glob.glob(pdf_glob):
        # Split the path into base name and extension for the metadata.
        file_name, file_type = os.path.splitext(path)
        print(f"Processing {file_name}")
        document = process_document(processor_name, file_path=path)
        if not document:
            print("Processing did not complete successfully.")
            continue
        # Split the text into chunks, one per paragraph. (Chunking by page
        # or by block via document.pages / page.blocks would also work.)
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]
        # Record each chunk with its source-file metadata.
        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data
if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    # Assigned at module scope on purpose: create_processor/process_document
    # read this global when the file is run as a script.
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)
    # If you already have a Document AI Processor in your project, assign the
    # full processor resource name here instead of creating a new one.
    processor_name = processor.name
    extracted_data: list[dict] = []
    extracted_data = pdf_processor(processor_name, extracted_data)
    # Convert extracted_data to a DataFrame sorted by source file name.
    pdf_data = (
        pd.DataFrame.from_dict(extracted_data)
        .sort_values(by=["file_name"])
        .reset_index(drop=True)
    )
    # Print (rather than discard) a preview of the extracted chunks.
    print(pdf_data.head())
    # Ensure the output directory exists before writing the CSV.
    os.makedirs("doc_ai", exist_ok=True)
    pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)