import json
from datetime import datetime
from pathlib import Path

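# Each processed_data/raw_extractions/*_raw.json file (produced by
# extract_pdfs.py) is expected to look roughly like the sketch below.
# This shape is inferred from the fields read in process_all_sources();
# the values shown are illustrative placeholders, not real data:
#
#   {
#       "filename": "<source document name>",
#       "extraction_date": "<ISO 8601 timestamp>",
#       "total_pages": 0,
#       "pages": [
#           {"text": "<extracted text for one page>"},
#           ...
#       ]
#   }
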
class QAGenerator:
    """Generate question-and-answer pairs from raw PDF text extractions."""

    def __init__(self):
        # Resolve all working directories relative to this script's location.
        self.base_dir = Path(__file__).parent
        self.output_dir = self.base_dir / 'processed_data'
        self.qa_dir = self.output_dir / 'qa_pairs'
        self.raw_dir = self.output_dir / 'raw_extractions'
        print(f"Looking for raw extractions in: {self.raw_dir}")

        self.qa_dir.mkdir(parents=True, exist_ok=True)
        self.manifest = []

    def add_to_manifest(self, input_file, output_file, process_type, metadata):
        """Track transformations in manifest"""
        manifest_entry = {
            'timestamp': datetime.now().isoformat(),
            'input_file': str(input_file),
            'output_file': str(output_file),
            'process_type': process_type,
            'metadata': metadata
        }
        self.manifest.append(manifest_entry)

    def generate_qa_pairs(self, text, source_info):
        """Generate Q&A pairs from text"""
        qa_pairs = []

        # Treat blank-line-separated blocks of text as sections.
        sections = text.split('\n\n')

        for i, section in enumerate(sections):
            # Skip very short sections (under 100 characters).
            if len(section.strip()) < 100:
                continue

            # Two generic Q&A pairs per section: main points and summary.
            qa_pairs.extend([
                {
                    'question': f"What are the main points discussed in this section of {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'main_points',
                    'timestamp': datetime.now().isoformat()
                },
                {
                    'question': f"Can you summarize the key concepts from this passage in {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'summary',
                    'timestamp': datetime.now().isoformat()
                }
            ])

            # Add a recovery-focused pair when the section mentions AA-related terms.
            if any(word in section.lower() for word in ['step', 'tradition', 'recovery', 'sobriety']):
                qa_pairs.append({
                    'question': f"What recovery principles or concepts are discussed in this section of {source_info['title']}?",
                    'answer': section.strip(),
                    'source': source_info,
                    'section_index': i,
                    'qa_type': 'aa_specific',
                    'timestamp': datetime.now().isoformat()
                })

        return qa_pairs

    def process_all_sources(self):
        """Process all extracted texts into QA pairs"""
        raw_dir = self.raw_dir

        if not raw_dir.exists():
            raise FileNotFoundError(f"Directory not found: {raw_dir}. Please run extract_pdfs.py first.")

        all_qa_pairs = []
        sources_processed = []

        for raw_file in raw_dir.glob('*_raw.json'):
            print(f"\nProcessing {raw_file.name}...")

            with open(raw_file, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)

            source_info = {
                'title': raw_data['filename'],
                'extraction_date': raw_data['extraction_date'],
                'total_pages': raw_data['total_pages']
            }
            # Join page texts with blank lines so the paragraph split in
            # generate_qa_pairs() also finds section breaks at page boundaries.
            full_text = '\n\n'.join(
                page['text'] for page in raw_data['pages']
                if 'text' in page
            )
            qa_pairs = self.generate_qa_pairs(full_text, source_info)

            # Write one JSONL file of Q&A pairs per source document.
            source_output = self.qa_dir / f"{raw_file.stem.replace('_raw', '')}_qa.jsonl"
            with open(source_output, 'w', encoding='utf-8') as f:
                for pair in qa_pairs:
                    f.write(json.dumps(pair) + '\n')

            self.add_to_manifest(
                raw_file,
                source_output,
                'qa_generation',
                {
                    'pairs_generated': len(qa_pairs),
                    'source': source_info['title']
                }
            )

            all_qa_pairs.extend(qa_pairs)
            sources_processed.append(source_info)

            print(f"Generated {len(qa_pairs)} Q&A pairs")

        # Write a combined JSONL file: the first line holds run metadata,
        # every following line is a single Q&A pair.
        combined_output = self.qa_dir / 'combined_qa.jsonl'
        with open(combined_output, 'w', encoding='utf-8') as f:
            metadata = {
                'timestamp': datetime.now().isoformat(),
                'total_pairs': len(all_qa_pairs),
                'sources': sources_processed
            }
            f.write(json.dumps(metadata) + '\n')

            for pair in all_qa_pairs:
                f.write(json.dumps(pair) + '\n')

        # Save the provenance manifest alongside the Q&A files.
        manifest_file = self.qa_dir / 'qa_generation_manifest.json'
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(self.manifest, f, indent=2)

        print("\nQ&A Generation Summary:")
        print(f"Total sources processed: {len(sources_processed)}")
        print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")
        print(f"Individual source files saved in: {self.qa_dir}")
        print(f"Combined Q&A pairs saved as: {combined_output}")
        print(f"Provenance data saved as: {manifest_file}")

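# A minimal sketch of how downstream code might read combined_qa.jsonl back:
# the first line is the run-metadata record, every later line is one Q&A pair.
# The function name and return shape below are hypothetical and not used
# elsewhere in this script.
def load_combined_qa(path):
    """Return (metadata, qa_pairs) parsed from a combined_qa.jsonl file."""
    with open(path, 'r', encoding='utf-8') as f:
        metadata = json.loads(f.readline())
        qa_pairs = [json.loads(line) for line in f if line.strip()]
    return metadata, qa_pairs
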
if __name__ == "__main__":
    generator = QAGenerator()
    generator.process_all_sources()