File size: 2,573 Bytes
7b2e5db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re
from collections import Counter

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

from research_assistant.app_logging import app_logger


def pdf_parser(pdf_path):
    """
    Extracts text from a PDF file, removing headers, footers, and page numbers.

    The document is parsed once. On every page with at least two non-empty
    text blocks, the first block is a header candidate and the last block is
    a footer candidate. A candidate is treated as a header/footer when it
    occupies that position on more than half of all pages and on at least two
    pages. Blocks that are exactly equal to such a header/footer, or that
    consist only of a page number ("3", "Page 3", "3/10"), are dropped.

    Args:
        pdf_path (str): The file path to the PDF.
    Returns:
        str: The extracted text suitable for LLM input, with newlines
            replaced by spaces. An empty string is returned (and the error
            is logged) if the PDF cannot be parsed.
    """
    # Matches blocks that consist solely of a page number
    page_number_pattern = re.compile(r"^(Page\s+)?\d+(/\d+)?$")

    # One list of stripped, non-empty text blocks per page
    pages = []
    try:
        for page_layout in extract_pages(pdf_path):
            blocks = []
            for element in page_layout:
                if not isinstance(element, LTTextContainer):
                    continue
                text = element.get_text().strip()
                if text:
                    blocks.append(text)
            pages.append(blocks)
    except Exception as e:
        # Best-effort API: callers receive "" instead of an exception
        app_logger.error(f"Failed to parse PDF {pdf_path}: {e}")
        return ""

    # Count how often each block appears as the first / last block of a page
    header_counter, footer_counter = Counter(), Counter()
    for blocks in pages:
        if len(blocks) >= 2:
            header_counter[blocks[0]] += 1
            footer_counter[blocks[-1]] += 1

    # A block must recur on more than half of the pages AND on at least two
    # pages; without the second condition a one-page document would lose its
    # first and last blocks.
    min_count = max(len(pages) * 0.5, 1)
    boilerplate = {
        text
        for counter in (header_counter, footer_counter)
        for text, count in counter.items()
        if count > min_count
    }

    # Exact comparison: body text that merely starts with a header string
    # must be kept.
    kept = [
        text
        for blocks in pages
        for text in blocks
        if text not in boilerplate and not page_number_pattern.match(text)
    ]
    return " ".join(kept).replace("\n", " ").strip()