# coderpotter's picture
# Upload folder using huggingface_hub
# 7b2e5db verified
import re
from collections import Counter

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

from research_assistant.app_logging import app_logger
def pdf_parser(pdf_path):
    """
    Extract the text of a PDF, dropping headers, footers, and page numbers.

    The first and last non-empty text blocks of every page that has at least
    two blocks are treated as header and footer candidates. A candidate is
    considered a real header/footer when the identical text occupies that
    position on more than half of the pages and on at least two pages. Such
    blocks are removed wherever they appear, together with blocks that consist
    solely of a page number (e.g. "3", "Page 3", "3/10").

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        str: The remaining text blocks joined by single spaces, with newlines
        replaced by spaces. An empty string is returned (and the error is
        logged) if anything goes wrong while parsing.
    """
    # Matches blocks that contain nothing but a page number.
    page_number_pattern = re.compile(r"^(Page\s+)?\d+(/\d+)?$")

    try:
        # Lay out the document once and keep the non-empty text blocks of each
        # page; layout analysis is the expensive step, so it is not repeated.
        pages = []
        for page_layout in extract_pages(pdf_path):
            blocks = []
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                text = element.get_text().strip()
                if text:
                    blocks.append(text)
            pages.append(blocks)

        total_pages = len(pages)

        # Pages with fewer than two blocks cannot have both a header and a
        # footer, so they contribute no candidates (but still count as pages).
        header_counter = Counter(blocks[0] for blocks in pages if len(blocks) >= 2)
        footer_counter = Counter(blocks[-1] for blocks in pages if len(blocks) >= 2)

        # A block is boilerplate only if it really repeats: on a single-page
        # document every block trivially appears on "more than half" of the
        # pages, so at least two occurrences are required as well.
        boilerplate = {
            text
            for counter in (header_counter, footer_counter)
            for text, count in counter.items()
            if count >= 2 and count > total_pages * 0.5
        }

        # Compare whole blocks, not prefixes: body text that merely starts
        # with the header/footer string must be kept.
        extracted_text = [
            block
            for blocks in pages
            for block in blocks
            if block not in boilerplate and not page_number_pattern.match(block)
        ]

        return " ".join(extracted_text).replace("\n", " ").strip()
    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty string rather
        # than an exception when the PDF is unreadable.
        app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
        return ""