import os |
from PyPDF2 import PdfReader |
import re |
import json |
from pathlib import Path |
from datetime import datetime |
import hashlib |
class PDFExtractor:
    """Extract and clean text from PDF files while recording provenance.

    For every PDF processed, a per-page JSON dump is written to the
    ``raw_extractions`` directory.  The combined text of all PDFs and a
    manifest describing each transformation are written to the
    ``processed_data`` directory.
    """

    # Ordered (pattern, replacement) pairs applied by ``clean_text``.
    # Order matters: words hyphenated across a line break must be re-joined
    # before newlines are flattened to spaces, and whitespace is collapsed
    # last so gaps left by earlier removals disappear.
    _CLEANING_RULES = (
        (re.compile(r'\f'), ' '),
        (re.compile(r'[^\x00-\x7F]+'), ''),
        (re.compile(r'Alcoholics Anonymous World Services, Inc\.?'), ''),
        (re.compile(r'Page \d+'), ''),
        (re.compile(r'(\w)-\s*\n(\w)'), r'\1\2'),
        (re.compile(r'[\r\n]+'), ' '),
        (re.compile(r'\s+'), ' '),
    )

    def __init__(self):
        """Set up output locations and start with an empty manifest.

        ``base_dir`` is the parent of the directory containing this file.
        ``processed_data/raw_extractions`` is created beneath it right away
        (including any missing parents).
        """
        self.base_dir = Path(__file__).parent.parent
        self.output_dir = self.base_dir / 'processed_data'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Creating directories in: {self.output_dir}")
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.manifest = []

    def generate_file_hash(self, filepath):
        """Return the hexadecimal MD5 digest of the file at *filepath*.

        The digest is used as a tracking fingerprint in the manifest.  The
        file is read in fixed-size chunks so it is never loaded whole.
        """
        digest = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def add_to_manifest(self, input_file, output_file, process_type, metadata):
        """Append one provenance record to ``self.manifest``.

        Args:
            input_file: Path of the file that was read; it is hashed here,
                so it must still exist and be readable.
            output_file: Path of the file that was produced from it.
            process_type: Short label naming the transformation.
            metadata: JSON-serialisable details stored with the record.
        """
        self.manifest.append({
            'timestamp': datetime.now().isoformat(),
            'input_file': str(input_file),
            'output_file': str(output_file),
            'process_type': process_type,
            'file_hash': self.generate_file_hash(input_file),
            'metadata': metadata,
        })

    def clean_text(self, text):
        """Return *text* with extraction noise removed.

        Form feeds become spaces; non-ASCII characters, the publisher
        boilerplate line and ``Page <n>`` markers are dropped; words
        hyphenated across a line break are re-joined; all remaining
        whitespace runs collapse to single spaces.

        ``None`` or an empty string yields ``""`` so that a page without
        extractable text is treated as empty rather than as a failure.
        """
        if not text:
            return ""
        for pattern, replacement in self._CLEANING_RULES:
            text = pattern.sub(replacement, text)
        return text.strip()

    def extract_pdf(self, pdf_path):
        """Extract cleaned text from every page of one PDF.

        Args:
            pdf_path: Location of the PDF, as a ``str`` or ``Path``.

        Returns:
            A dict with ``filename``, ``total_pages``, ``extraction_date``
            and ``pages``.  Each page entry holds ``page_number`` plus
            either ``text``/``char_count``/``word_count`` or, when that page
            could not be extracted, ``error``.

        Errors raised while opening the PDF propagate to the caller; errors
        on an individual page are recorded and extraction continues.
        """
        # Accept plain strings as well as Path objects; ``.name`` is needed.
        pdf_path = Path(pdf_path)
        print(f"Processing: {pdf_path}")
        reader = PdfReader(pdf_path)
        pages = reader.pages
        extraction_data = {
            'filename': pdf_path.name,
            'total_pages': len(pages),
            'extraction_date': datetime.now().isoformat(),
            'pages': [],
        }
        for page_num, page in enumerate(pages, 1):
            try:
                cleaned_text = self.clean_text(page.extract_text())
            except Exception as e:
                # Best effort: one unreadable page must not lose the rest.
                print(f"Error on page {page_num}: {str(e)}")
                extraction_data['pages'].append({
                    'page_number': page_num,
                    'error': str(e),
                })
            else:
                extraction_data['pages'].append({
                    'page_number': page_num,
                    'text': cleaned_text,
                    'char_count': len(cleaned_text),
                    'word_count': len(cleaned_text.split()),
                })
        return extraction_data

    def process_pdfs(self, pdf_dir=None):
        """Extract every ``*.pdf`` in *pdf_dir* and write all outputs.

        Args:
            pdf_dir: Directory to scan.  Defaults to ``src_files`` relative
                to the current working directory.

        Returns:
            A dict mapping each successfully processed PDF's file name to
            its combined page text.  The same mapping is written to
            ``extracted_text.json`` and the manifest to
            ``extraction_manifest.json``, both in ``self.output_dir``.

        A PDF that fails is reported and skipped; the remaining files are
        still processed.
        """
        pdf_dir = Path("src_files") if pdf_dir is None else Path(pdf_dir)
        if pdf_dir.is_dir():
            # Sorted so output order does not depend on the filesystem.
            pdf_paths = sorted(pdf_dir.glob("*.pdf"))
        else:
            print(f"Warning: PDF directory not found: {pdf_dir.resolve()}")
            pdf_paths = []

        combined_data = {}
        failed = []
        for pdf_path in pdf_paths:
            try:
                extraction_data = self.extract_pdf(pdf_path)
                source_name = pdf_path.stem.lower()
                raw_output = self.raw_dir / f"{source_name}_raw.json"
                with open(raw_output, 'w', encoding='utf-8') as f:
                    json.dump(extraction_data, f, indent=2)
                page_records = extraction_data['pages']
                self.add_to_manifest(
                    pdf_path,
                    raw_output,
                    'pdf_extraction',
                    {
                        'total_pages': extraction_data['total_pages'],
                        'successful_pages': sum(
                            1 for p in page_records if 'error' not in p
                        ),
                    },
                )
                # Pages that failed have no 'text' key and are left out.
                combined_data[pdf_path.name] = ' '.join(
                    p['text'] for p in page_records if 'text' in p
                )
            except Exception as e:
                failed.append(pdf_path.name)
                print(f"Error processing {pdf_path.name}: {str(e)}")

        combined_output = self.output_dir / 'extracted_text.json'
        with open(combined_output, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, indent=2)
        manifest_file = self.output_dir / 'extraction_manifest.json'
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(self.manifest, f, indent=2)

        print("\nExtraction Summary:")
        print(f"Processed files: {len(combined_data)}")
        if failed:
            print(f"Failed files: {len(failed)}")
        print(f"Raw extractions saved in: {self.raw_dir}")
        print(f"Combined data saved as: {combined_output}")
        print(f"Provenance data saved as: {manifest_file}")
        return combined_data
# Script entry point: build an extractor and run the whole pipeline once.
if __name__ == "__main__":
    PDFExtractor().process_pdfs()