import re
from collections import Counter

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

from research_assistant.app_logging import app_logger

# Matches standalone page-number lines such as "3", "3/10", or "Page 3".
_PAGE_NUMBER_RE = re.compile(r"^(Page\s+)?\d+(/\d+)?$")


def pdf_parser(pdf_path):
    """Extract body text from a PDF, dropping headers, footers and page numbers.

    Headers and footers are detected heuristically: the first and last
    non-empty text element of each page are counted, and any string that
    recurs on more than half of the pages is treated as a header/footer
    and removed from every page.

    Args:
        pdf_path (str): The file path to the PDF.

    Returns:
        str: The cleaned, single-line text suitable for LLM input, or ""
        if parsing fails.
    """
    try:
        # Single pass over the PDF: cache each page's stripped text
        # elements so we don't parse the file a second time.
        pages = []
        for page_layout in extract_pages(pdf_path):
            pages.append(
                [
                    element.get_text().strip()
                    for element in page_layout
                    if isinstance(element, LTTextContainer)
                    and element.get_text().strip()
                ]
            )

        # Count first/last line per page to find recurring headers/footers.
        # Pages with fewer than two elements can't distinguish the two.
        header_counter = Counter(p[0] for p in pages if len(p) >= 2)
        footer_counter = Counter(p[-1] for p in pages if len(p) >= 2)

        threshold = len(pages) * 0.5
        headers = {s for s, n in header_counter.items() if n > threshold}
        footers = {s for s, n in footer_counter.items() if n > threshold}

        # Keep lines that are neither recurring headers/footers nor bare
        # page numbers. Exact set membership (not a regex prefix match on
        # the escaped text) avoids stripping body lines that merely start
        # with the header string.
        cleaned = [
            line
            for page in pages
            for line in page
            if line not in headers
            and line not in footers
            and not _PAGE_NUMBER_RE.match(line)
        ]
        return " ".join(cleaned).replace("\n", " ").strip()
    except Exception as e:
        # Boundary handler: log and degrade to an empty result rather than
        # propagate parser errors to the caller. Lazy %-args keep the log
        # message on one line (the original f-string spanned a newline,
        # which is a syntax error).
        app_logger.error("Failed to parse PDF %s: %s", pdf_path, e)
        return ""