# llama-aa-fine-tuned / extract_pdfs.py
# Uploaded by Lukeam with huggingface_hub (commit 3807a79, verified).
import os
from PyPDF2 import PdfReader
import re
import json
from pathlib import Path
from datetime import datetime
import hashlib
class PDFExtractor:
    """Extract text from PDFs under src_files/ into processed_data/, with provenance tracking."""

    def __init__(self):
        # The script lives in src_files/, so the project root is two levels up.
        root = Path(__file__).parent.parent
        self.base_dir = root
        self.output_dir = root / 'processed_data'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Creating directories in: {self.output_dir}")
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        # One provenance entry is appended per processed file.
        self.manifest = []
def generate_file_hash(self, filepath):
    """Return the MD5 hex digest of *filepath*, read in 4 KiB chunks.

    MD5 is used only as a change-detection fingerprint for provenance
    tracking, not for any security purpose.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as fh:
        chunk = fh.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = fh.read(4096)
    return digest.hexdigest()
def add_to_manifest(self, input_file, output_file, process_type, metadata):
    """Append one provenance record describing an input->output transformation.

    The record captures when it happened, which files were involved, the
    kind of processing, a content hash of the input, and caller-supplied
    metadata.
    """
    self.manifest.append({
        'timestamp': datetime.now().isoformat(),
        'input_file': str(input_file),
        'output_file': str(output_file),
        'process_type': process_type,
        'file_hash': self.generate_file_hash(input_file),
        'metadata': metadata,
    })
def clean_text(self, text):
    """Normalize raw PDF text: drop boilerplate, repair hyphenation, collapse whitespace.

    Order matters: the de-hyphenation rule needs to see the original line
    breaks, so it runs before newlines are collapsed into spaces.
    """
    substitutions = [
        (r'\f', ' '),                                           # form feeds -> spaces
        (r'[^\x00-\x7F]+', ''),                                 # strip non-ASCII
        (r'Alcoholics Anonymous World Services, Inc\.?', ''),   # publisher boilerplate
        (r'Page \d+', ''),                                      # page-number markers
        (r'(\w)-\s*\n(\w)', r'\1\2'),                           # rejoin words split across lines
        (r'[\r\n]+', ' '),                                      # newlines -> spaces
        (r'\s+', ' '),                                          # collapse whitespace runs
    ]
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def extract_pdf(self, pdf_path):
    """Extract text from every page of *pdf_path* with page-level tracking.

    Returns a dict with the filename, total page count, extraction
    timestamp, and a per-page list of cleaned text plus character and
    word counts. A page that fails to extract is recorded with its error
    message instead of aborting the whole document.
    """
    print(f"Processing: {pdf_path}")
    reader = PdfReader(pdf_path)
    extraction_data = {
        'filename': pdf_path.name,
        'total_pages': len(reader.pages),
        'extraction_date': datetime.now().isoformat(),
        'pages': []
    }
    for page_num, page in enumerate(reader.pages, 1):
        try:
            # extract_text() may yield an empty result (or None in some
            # PyPDF2 versions) for image-only pages; normalize to "" so
            # cleaning records an empty page rather than an error.
            raw_text = page.extract_text() or ""
            cleaned_text = self.clean_text(raw_text)
            extraction_data['pages'].append({
                'page_number': page_num,
                'text': cleaned_text,
                'char_count': len(cleaned_text),
                'word_count': len(cleaned_text.split())
            })
        except Exception as e:
            # Best-effort: one bad page shouldn't sink the document.
            print(f"Error on page {page_num}: {str(e)}")
            extraction_data['pages'].append({
                'page_number': page_num,
                'error': str(e)
            })
    return extraction_data
def process_pdfs(self):
    """Process every PDF in src_files/ with provenance tracking.

    Writes one raw per-page JSON per PDF into raw_dir, a combined
    filename->text JSON (kept for backward compatibility), and a
    manifest of all transformations into output_dir.
    """
    # Anchor on base_dir (derived from __file__ in __init__) instead of
    # the current working directory, so the script works from anywhere.
    pdf_dir = self.base_dir / "src_files"
    combined_data = {}
    # sorted() makes the processing (and manifest) order deterministic.
    for pdf_path in sorted(pdf_dir.glob("*.pdf")):
        try:
            # Extract text with metadata.
            extraction_data = self.extract_pdf(pdf_path)

            # Save the individual raw extraction.
            source_name = pdf_path.stem.lower()
            raw_output = self.raw_dir / f"{source_name}_raw.json"
            with open(raw_output, 'w', encoding='utf-8') as f:
                json.dump(extraction_data, f, indent=2)

            # Record provenance for this transformation.
            self.add_to_manifest(
                pdf_path,
                raw_output,
                'pdf_extraction',
                {
                    'total_pages': extraction_data['total_pages'],
                    'successful_pages': len([p for p in extraction_data['pages'] if 'error' not in p])
                }
            )

            # Combine all page text for the traditional output.
            combined_data[pdf_path.name] = ' '.join(
                page['text'] for page in extraction_data['pages']
                if 'text' in page
            )
        except Exception as e:
            # Best-effort: log and continue with the remaining PDFs.
            print(f"Error processing {pdf_path.name}: {str(e)}")

    # Save combined data (for backward compatibility).
    combined_output = self.output_dir / 'extracted_text.json'
    with open(combined_output, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=2)

    # Save the manifest.
    manifest_file = self.output_dir / 'extraction_manifest.json'
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump(self.manifest, f, indent=2)

    print("\nExtraction Summary:")
    print(f"Processed files: {len(combined_data)}")
    print(f"Raw extractions saved in: {self.raw_dir}")
    print(f"Combined data saved as: {combined_output}")
    print(f"Provenance data saved as: {manifest_file}")
if __name__ == "__main__":
extractor = PDFExtractor()
extractor.process_pdfs()