Spaces:
Runtime error
Runtime error
import re | |
from pdfminer.high_level import extract_pages | |
from pdfminer.layout import LTTextContainer | |
from research_assistant.app_logging import app_logger | |
def pdf_parser(pdf_path):
    """
    Extract the body text of a PDF, dropping headers, footers and page numbers.

    The document is laid out once. On every page that has at least two
    non-empty text blocks, the first block is a header candidate and the last
    block is a footer candidate. A candidate is treated as a real header or
    footer only when the identical text is the first/last block on more than
    half of the pages and on at least two pages, so a single-page document
    never loses its first and last blocks.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        str: The remaining text joined with single spaces (newlines inside
        blocks are replaced by spaces), suitable for LLM input. If anything
        goes wrong while parsing, the error is logged and an empty string is
        returned.
    """
    # Matches blocks that consist solely of a page number: "3", "3/10", "Page 3".
    page_number_pattern = re.compile(r"^(Page\s+)?\d+(/\d+)?$")

    try:
        # Single pass over the PDF: layout analysis is the expensive part, so
        # keep each page's text blocks for the filtering step below instead of
        # parsing the file a second time.
        pages = []
        header_counter, footer_counter = {}, {}
        for page_layout in extract_pages(pdf_path):
            blocks = []
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                text = element.get_text().strip()
                if text:
                    blocks.append(text)
            pages.append(blocks)

            if len(blocks) >= 2:
                header, footer = blocks[0], blocks[-1]
                header_counter[header] = header_counter.get(header, 0) + 1
                footer_counter[footer] = footer_counter.get(footer, 0) + 1

        total_pages = len(pages)

        def recurring(counter):
            # "More than half of the pages" alone would accept a count of 1 on
            # a one-page document, so also require a genuine repetition.
            return {
                text
                for text, count in counter.items()
                if count > 1 and count > total_pages * 0.5
            }

        # Exact-text comparison: a body paragraph that merely starts with the
        # header/footer string must not be discarded.
        boilerplate = recurring(header_counter) | recurring(footer_counter)

        extracted_text = [
            block
            for blocks in pages
            for block in blocks
            if block not in boilerplate and not page_number_pattern.match(block)
        ]
        return " ".join(extracted_text).replace("\n", " ").strip()
    except Exception as e:
        # Deliberate best-effort: callers get an empty string rather than an
        # exception when the PDF is missing, corrupt or unreadable.
        app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
        return ""