|
from __future__ import annotations

import glob
import os  # was `import osa` — typo that raised ImportError at startup
import re
import textwrap

from google.api_core.client_options import ClientOptions
from google.cloud import documentai
import pandas as pd
|
|
|
def create_processor(
    project_id: str, location: str, processor_display_name: str
) -> documentai.Processor:
    """Create a Document AI OCR processor.

    Args:
        project_id: Google Cloud project to create the processor in.
        location: Document AI region (e.g. ``"us"``); also selects the
            regional API endpoint.
        processor_display_name: Human-readable name for the new processor.

    Returns:
        The newly created ``documentai.Processor`` resource.
    """
    # Build the regional endpoint from our own `location` parameter instead of
    # depending on a module-level `client_options` global that is only defined
    # under the __main__ guard (the original broke when imported as a module).
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    parent = client.common_location_path(project_id, location)

    return client.create_processor(
        parent=parent,
        processor=documentai.Processor(
            display_name=processor_display_name, type_="OCR_PROCESSOR"
        ),
    )
|
|
|
def process_document(
    processor_name: str,
    file_path: str,
) -> documentai.Document:
    """Run one local PDF through a Document AI processor synchronously.

    Args:
        processor_name: Full processor resource name, i.e.
            ``projects/{project}/locations/{location}/processors/{id}``.
        file_path: Path to a local PDF file.

    Returns:
        The processed ``documentai.Document`` (OCR result).

    Raises:
        ValueError: If ``processor_name`` does not contain a location segment.
    """
    # Recover the region from the resource name so we can build the regional
    # endpoint locally, instead of reading a `client_options` global that only
    # exists when the file runs as a script.
    location_match = re.search(r"/locations/([^/]+)/", processor_name)
    if not location_match:
        raise ValueError(f"Invalid processor resource name: {processor_name}")
    opts = ClientOptions(
        api_endpoint=f"{location_match.group(1)}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)

    return result.document
|
|
|
def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """Resolve a layout's text anchor against the full document text.

    Document AI describes where a piece of text lives as (start, end) offset
    segments into the document's complete text. This gathers every segment
    the layout points at and concatenates them into one string.
    """
    pieces = []
    for segment in layout.text_anchor.text_segments:
        start = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[start:end])
    return "".join(pieces)
|
|
|
def pdf_processor(
    processor_name: str,
    extracted_data: list[dict],
    pdf_glob: str = "docs/*.pdf",
) -> list[dict]:
    """OCR every PDF matching a glob and collect one record per paragraph.

    Args:
        processor_name: Full Document AI processor resource name.
        extracted_data: List the chunk records are appended to (mutated
            in place and also returned for convenience).
        pdf_glob: Glob pattern selecting the PDFs to process. Defaults to
            ``"docs/*.pdf"``, matching the original hard-coded behavior.

    Returns:
        ``extracted_data``, extended with dicts of the form
        ``{"file_name", "file_type", "chunk_number", "content"}``.
    """
    for path in glob.glob(pdf_glob):
        # NOTE: file_name keeps any directory prefix (e.g. "docs/report"),
        # matching the original behavior downstream code may rely on.
        file_name, file_type = os.path.splitext(path)

        print(f"Processing {file_name}")

        document = process_document(processor_name, file_path=path)

        if not document:
            print("Processing did not complete successfully.")
            continue

        # One chunk per paragraph, in page order.
        document_chunks = [
            layout_to_text(paragraph.layout, document.text)
            for page in document.pages
            for paragraph in page.paragraphs
        ]

        for chunk_number, chunk_content in enumerate(document_chunks, start=1):
            extracted_data.append(
                {
                    "file_name": file_name,
                    "file_type": file_type,
                    "chunk_number": chunk_number,
                    "content": chunk_content,
                }
            )
    return extracted_data
|
|
|
|
|
if __name__ == "__main__":
    project_id = "iglintdb"
    location = "us"
    processor_display_name = "knowledge-base-ocr-processor-test-1"
    # Kept module-level on purpose: the helper functions above read this
    # global when building their Document AI clients.
    client_options = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    processor = create_processor(project_id, location, processor_display_name)

    processor_name = processor.name
    extracted_data: list[dict] = []

    extracted_data = pdf_processor(processor_name, extracted_data)

    if extracted_data:
        pdf_data = (
            pd.DataFrame.from_dict(extracted_data)
            .sort_values(by=["file_name"])
            .reset_index(drop=True)
        )

        # Print (not just evaluate) the preview so it is visible in a script run.
        print(pdf_data.head())

        # Make sure the output directory exists; to_csv does not create it.
        os.makedirs("doc_ai", exist_ok=True)
        pdf_data.to_csv("doc_ai/pdf_data.csv", index=False)
    else:
        # An empty result would make sort_values raise KeyError on "file_name".
        print("No PDF chunks extracted; nothing to write.")
|
|
|
|
|
|
|
|
|
|