Broadridge_AiContract

Sleeping

Broadridge_AiContract / pdfProcessor.py

Create pdfProcessor.py

4834106 verified 4 months ago

1.29 kB

	from langchain_community.document_loaders import PyPDFLoader
	import os
	from typing import List
	class PDFProcessor:
	    """Extract text content from PDF files using ``PyPDFLoader``."""

	    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
	        """Extract text content from a list of PDF files.

	        Each file is loaded with ``PyPDFLoader.load_and_split()`` and the
	        ``page_content`` of every returned document is appended to one
	        flat list, in input order. Processing is best-effort: a file that
	        raises while being processed is reported on stdout and skipped,
	        and the remaining files are still handled.

	        Args:
	            file_paths: Paths to the PDF documents. A single path given
	                as a bare ``str`` or ``os.PathLike`` is treated as a
	                one-element list instead of being iterated character
	                by character.

	        Returns:
	            The extracted text pieces from all files that could be read;
	            an empty list if nothing could be read.
	        """
	        # A lone string is itself iterable; without this guard every
	        # character would be tried as a separate file path.
	        if isinstance(file_paths, (str, os.PathLike)):
	            file_paths = [file_paths]

	        texts = []
	        for file_path in file_paths:
	            try:
	                loader = PyPDFLoader(file_path)
	                pages = loader.load_and_split()

	                for page in pages:
	                    content = page.page_content
	                    if isinstance(content, bytes):
	                        # Drop undecodable bytes rather than failing the file.
	                        text = content.decode('utf-8', errors='ignore')
	                    elif isinstance(content, str):
	                        text = content
	                    else:
	                        print(f"Unexpected type: {type(content)}")
	                        continue
	                    texts.append(text)
	            except Exception as e:
	                # Deliberately broad: one unreadable file must not abort
	                # the whole batch.
	                print(f"Failed to process {file_path}: {e}")

	        return texts