Broadridge_AiContract / pdfProcessor.py
karthikeyan-r's picture
Create pdfProcessor.py
4834106 verified
raw
history blame
1.29 kB
from langchain_community.document_loaders import PyPDFLoader
import os
from typing import List
class PDFProcessor:
    """
    Helper that pulls text out of PDF documents using ``PyPDFLoader``.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Gather the text of every chunk produced from the given PDF files.

        Each path is opened with ``PyPDFLoader`` and split via
        ``load_and_split()``; the ``page_content`` of each resulting item is
        appended to a single flat list, following the order of ``file_paths``.
        A file that raises while being processed is reported with ``print``
        and skipped, so one bad document does not abort the whole batch.

        Args:
            file_paths (List[str]): Paths of the PDF documents to read.

        Returns:
            List[str]: Text pieces collected from all files that could be
                processed. Empty when nothing could be extracted.
        """
        collected = []
        for file_path in file_paths:
            try:
                chunks = PyPDFLoader(file_path).load_and_split()
                for page in chunks:
                    content = page.page_content
                    if isinstance(content, str):
                        collected.append(content)
                    elif isinstance(content, bytes):
                        # Bytes that are not valid UTF-8 are dropped, not raised.
                        collected.append(content.decode('utf-8', errors='ignore'))
                    else:
                        # Neither str nor bytes: report it and move on.
                        print(f"Unexpected type: {type(page.page_content)}")
            except Exception as e:
                # Best-effort batch: log the failure and continue with the next file.
                print(f"Failed to process {file_path}: {e}")
        return collected