|
import hashlib
import json
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from PyPDF2 import PdfReader
|
|
|
class PDFExtractor:
    """Extract text from PDF files and record provenance for each extraction.

    All output is written below ``<base_dir>/processed_data``, where
    ``base_dir`` is two directory levels above this source file:

    * ``raw_extractions/<stem>_raw.json`` -- page-level data for one PDF
    * ``extracted_text.json`` -- mapping of PDF file name to its joined text
    * ``extraction_manifest.json`` -- one provenance entry per processed PDF
    """

    # Input directory used when ``process_pdfs`` is called without an
    # argument. It is relative, so it resolves against the current working
    # directory (unlike the output paths, which are anchored on this file).
    DEFAULT_PDF_DIR = Path("src_files")

    # Ordered (pattern, replacement) pairs applied by ``clean_text``.
    # The order matters: hyphenated line breaks must be re-joined while the
    # newline is still present, i.e. before newlines are turned into spaces.
    _CLEANUP_STEPS = (
        # Form feeds become a plain space.
        (re.compile(r'\f'), ' '),
        # Lossy: every run of non-ASCII characters is dropped, not
        # transliterated.
        (re.compile(r'[^\x00-\x7F]+'), ''),
        # Publisher boilerplate.
        (re.compile(r'Alcoholics Anonymous World Services, Inc\.?'), ''),
        # Running "Page N" markers.
        (re.compile(r'Page \d+'), ''),
        # Re-join words hyphenated across a line break ("hel-\nlo" -> "hello").
        (re.compile(r'(\w)-\s*\n(\w)'), r'\1\2'),
        # Remaining line breaks become spaces ...
        (re.compile(r'[\r\n]+'), ' '),
        # ... and every whitespace run collapses to a single space.
        (re.compile(r'\s+'), ' '),
    )

    def __init__(self) -> None:
        """Resolve the output directories, create them, and start an empty manifest."""
        self.base_dir = Path(__file__).parent.parent
        self.output_dir = self.base_dir / 'processed_data'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Creating directories in: {self.output_dir}")
        # raw_dir lives inside output_dir, so parents=True creates both.
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        # Provenance entries appended by add_to_manifest().
        self.manifest: List[Dict[str, Any]] = []

    def generate_file_hash(self, filepath: Union[str, Path]) -> str:
        """Return the hexadecimal MD5 digest of the file at *filepath*.

        The digest identifies the input file in the manifest. The file is
        read in fixed-size chunks so it is never held in memory as a whole.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def add_to_manifest(self, input_file, output_file, process_type, metadata) -> None:
        """Append a provenance entry describing one file transformation.

        Args:
            input_file: Path of the source file; it is hashed, so it must
                exist and be readable.
            output_file: Path of the file produced from ``input_file``.
            process_type: Short label for the transformation.
            metadata: Extra JSON-serialisable details stored verbatim.

        Raises:
            OSError: if ``input_file`` cannot be read for hashing.
        """
        self.manifest.append({
            'timestamp': datetime.now().isoformat(),
            'input_file': str(input_file),
            'output_file': str(output_file),
            'process_type': process_type,
            'file_hash': self.generate_file_hash(input_file),
            'metadata': metadata,
        })

    def clean_text(self, text: Optional[str]) -> str:
        """Normalise text extracted from a PDF page into a single line.

        Applies the substitutions in ``_CLEANUP_STEPS`` in order and strips
        leading/trailing whitespace. A falsy value (``None`` or an empty
        string) yields an empty string instead of raising.
        """
        if not text:
            return ""
        for pattern, replacement in self._CLEANUP_STEPS:
            text = pattern.sub(replacement, text)
        return text.strip()

    def extract_pdf(self, pdf_path: Union[str, Path]) -> Dict[str, Any]:
        """Extract cleaned text from every page of one PDF.

        Args:
            pdf_path: Location of the PDF (``str`` or ``Path``).

        Returns:
            A dict with ``filename``, ``total_pages``, ``extraction_date``
            and ``pages``. Each page entry holds ``page_number`` (1-based)
            plus either ``text``/``char_count``/``word_count`` or, when that
            page failed, ``error``.

        Raises:
            Exception: whatever ``PdfReader`` raises while opening the file
                or listing its pages; only per-page failures are caught.
        """
        pdf_path = Path(pdf_path)
        print(f"Processing: {pdf_path}")
        reader = PdfReader(pdf_path)
        extraction_data: Dict[str, Any] = {
            'filename': pdf_path.name,
            'total_pages': len(reader.pages),
            'extraction_date': datetime.now().isoformat(),
            'pages': [],
        }

        for page_num, page in enumerate(reader.pages, 1):
            # Deliberately broad: one unreadable page is recorded and skipped
            # so it cannot abort extraction of the rest of the document.
            try:
                cleaned_text = self.clean_text(page.extract_text())
            except Exception as e:
                print(f"Error on page {page_num}: {str(e)}")
                extraction_data['pages'].append({
                    'page_number': page_num,
                    'error': str(e),
                })
                continue

            extraction_data['pages'].append({
                'page_number': page_num,
                'text': cleaned_text,
                'char_count': len(cleaned_text),
                'word_count': len(cleaned_text.split()),
            })

        return extraction_data

    @staticmethod
    def _write_json(path, payload) -> None:
        """Serialise *payload* to *path* as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def process_pdfs(self, pdf_dir: Optional[Union[str, Path]] = None) -> None:
        """Extract every ``*.pdf`` in a directory and write all outputs.

        For each PDF this writes a raw page-level JSON file, records a
        manifest entry, and collects the joined page text. Afterwards the
        combined text and the manifest are written to ``output_dir`` and a
        summary is printed. A failure on one PDF is reported and skipped.

        Args:
            pdf_dir: Directory to scan. Defaults to ``DEFAULT_PDF_DIR``
                (``src_files`` relative to the current working directory).
        """
        pdf_dir = Path(pdf_dir) if pdf_dir is not None else self.DEFAULT_PDF_DIR
        if not pdf_dir.is_dir():
            # Globbing a missing directory yields nothing, so without this
            # message a wrong working directory looks like "0 files" only.
            print(f"Warning: PDF directory not found: {pdf_dir.resolve()}")

        combined_data: Dict[str, str] = {}

        # Sorted so the output files are ordered the same way on every run.
        for pdf_path in sorted(pdf_dir.glob("*.pdf")):
            # Deliberately broad: this is the per-file boundary of a
            # best-effort batch; the failure is reported and the run goes on.
            try:
                extraction_data = self.extract_pdf(pdf_path)

                source_name = pdf_path.stem.lower()
                raw_output = self.raw_dir / f"{source_name}_raw.json"
                self._write_json(raw_output, extraction_data)

                pages = extraction_data['pages']
                self.add_to_manifest(
                    pdf_path,
                    raw_output,
                    'pdf_extraction',
                    {
                        'total_pages': extraction_data['total_pages'],
                        'successful_pages': sum(1 for p in pages if 'error' not in p),
                    },
                )

                # Failed pages carry no 'text' key and are left out.
                combined_data[pdf_path.name] = ' '.join(
                    page['text'] for page in pages if 'text' in page
                )
            except Exception as e:
                print(f"Error processing {pdf_path.name}: {str(e)}")

        combined_output = self.output_dir / 'extracted_text.json'
        self._write_json(combined_output, combined_data)

        manifest_file = self.output_dir / 'extraction_manifest.json'
        self._write_json(manifest_file, self.manifest)

        print("\nExtraction Summary:")
        print(f"Processed files: {len(combined_data)}")
        print(f"Raw extractions saved in: {self.raw_dir}")
        print(f"Combined data saved as: {combined_output}")
        print(f"Provenance data saved as: {manifest_file}")
|
|
|
if __name__ == "__main__":
    # Script entry point: build the extractor (which creates the output
    # directories) and run the whole extraction pipeline once.
    PDFExtractor().process_pdfs()