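"""Generate question/answer pairs from raw PDF extractions.

Reads the *_raw.json files that extract_pdfs.py writes to
processed_data/raw_extractions, turns each document into a set of Q&A
pairs, and writes per-source and combined JSONL files plus a provenance
manifest to processed_data/qa_pairs. Run extract_pdfs.py first.
"""
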
import json
from pathlib import Path
from datetime import datetime
import hashlib


class QAGenerator:
    def __init__(self):
        # All paths stay within the aa_book directory
        self.base_dir = Path(__file__).parent  # current directory (aa_book)
        self.output_dir = self.base_dir / 'processed_data'
        self.qa_dir = self.output_dir / 'qa_pairs'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Looking for raw extractions in: {self.raw_dir}")
        self.qa_dir.mkdir(parents=True, exist_ok=True)
        self.manifest = []

    def add_to_manifest(self, input_file, output_file, process_type, metadata):
        """Track transformations in manifest"""
        manifest_entry = {
            'timestamp': datetime.now().isoformat(),
            'input_file': str(input_file),
            'output_file': str(output_file),
            'process_type': process_type,
            'metadata': metadata
        }
        self.manifest.append(manifest_entry)

    def generate_qa_pairs(self, text, source_info):
        """Generate Q&A pairs from text"""
        qa_pairs = []
        # Split into sections (chapters, paragraphs)
        sections = text.split('\n\n')
        for i, section in enumerate(sections):
            if len(section.strip()) < 100:  # Skip short sections
                continue
            # Generate different types of questions
            qa_pairs.extend([
                {
                    'question': f"What are the main points discussed in this section of {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'main_points',
                    'timestamp': datetime.now().isoformat()
                },
                {
                    'question': f"Can you summarize the key concepts from this passage in {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'summary',
                    'timestamp': datetime.now().isoformat()
                }
            ])
            # Add specific AA-related questions if relevant keywords are found
            if any(word in section.lower() for word in ['step', 'tradition', 'recovery', 'sobriety']):
                qa_pairs.append({
                    'question': f"What recovery principles or concepts are discussed in this section of {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'aa_specific',
                    'timestamp': datetime.now().isoformat()
                })
        return qa_pairs

    def process_all_sources(self):
        """Process all extracted texts into QA pairs"""
        # Raw extractions are written by extract_pdfs.py into processed_data/raw_extractions
        raw_dir = self.raw_dir
        if not raw_dir.exists():
            raise FileNotFoundError(f"Directory not found: {raw_dir}. Please run extract_pdfs.py first.")
        all_qa_pairs = []
        sources_processed = []
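        # Each *_raw.json is expected to contain 'filename', 'extraction_date',
        # 'total_pages', and a 'pages' list whose entries hold the extracted
        # 'text' for each page.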
        for raw_file in raw_dir.glob('*_raw.json'):
            print(f"\nProcessing {raw_file.name}...")
            with open(raw_file, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)
            # Create source info
            source_info = {
                'title': raw_data['filename'],
                'extraction_date': raw_data['extraction_date'],
                'total_pages': raw_data['total_pages']
            }
            # Combine all page text
            full_text = ' '.join(
                page['text'] for page in raw_data['pages']
                if 'text' in page
            )
            # Generate QA pairs
            qa_pairs = self.generate_qa_pairs(full_text, source_info)
            # Save source-specific QA pairs
            source_output = self.qa_dir / f"{raw_file.stem.replace('_raw', '')}_qa.jsonl"
            with open(source_output, 'w', encoding='utf-8') as f:
                for pair in qa_pairs:
                    f.write(json.dumps(pair) + '\n')
            # Add to manifest
            self.add_to_manifest(
                raw_file,
                source_output,
                'qa_generation',
                {
                    'pairs_generated': len(qa_pairs),
                    'source': source_info['title']
                }
            )
            all_qa_pairs.extend(qa_pairs)
            sources_processed.append(source_info)
            print(f"Generated {len(qa_pairs)} Q&A pairs")
        # Save combined QA pairs
        combined_output = self.qa_dir / 'combined_qa.jsonl'
        with open(combined_output, 'w', encoding='utf-8') as f:
            # Write metadata first
            metadata = {
                'timestamp': datetime.now().isoformat(),
                'total_pairs': len(all_qa_pairs),
                'sources': sources_processed
            }
            f.write(json.dumps(metadata) + '\n')
            # Write all QA pairs
            for pair in all_qa_pairs:
                f.write(json.dumps(pair) + '\n')
        # Save QA generation manifest
        manifest_file = self.qa_dir / 'qa_generation_manifest.json'
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(self.manifest, f, indent=2)
        print("\nQ&A Generation Summary:")
        print(f"Total sources processed: {len(sources_processed)}")
        print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")
        print(f"Individual source files saved in: {self.qa_dir}")
        print(f"Combined Q&A pairs saved as: {combined_output}")
        print(f"Provenance data saved as: {manifest_file}")


if __name__ == "__main__":
    generator = QAGenerator()
    generator.process_all_sources()