# aa_book / qanda_gen_script.py
# lambchop11's picture
# Updated Q&A dataset with enhanced questions
# 87ce387
import json
import re
# File locations used by the script entry point: the source text is read
# from input_file (one JSON object per line) and the generated Q&A pairs
# are written to output_file, also one JSON object per line.
output_file = "qa_dataset.jsonl"
input_file = "cleaned_big_book.jsonl"
def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
    """
    Build one Q&A record in the prompt/response/metadata layout.

    Args:
        prompt: The question text; it is wrapped in the
            "### Question: ... ### Answer:" template.
        response: The answer text. A falsy value (None, "") yields an
            empty response.
        source_info: Mapping that must contain a "chapter" key.
        location: Value stored under metadata["section"].
        max_length: Maximum number of characters kept from the response.

    Returns:
        A dict with "prompt", "response" and "metadata" keys.

    Raises:
        KeyError: If source_info has no "chapter" entry.
    """
    # Strip first so surrounding whitespace does not consume the length
    # budget, then strip the right edge again because the cut itself can
    # land just after a space or newline.
    cleaned_response = (response or "").strip()[:max_length].rstrip()

    return {
        "prompt": f"### Question: {prompt}\n\n### Answer:",
        "response": cleaned_response,
        "metadata": {
            "book": "Alcoholics Anonymous",
            "chapter": source_info["chapter"],
            "section": location,
            "edition": "First 164 pages",
            "type": "primary_text",
        },
    }
# Question templates asked of every chapter. The order here is the order of
# the generated pairs; "{chapter}" is replaced with the chapter name.
_GENERAL_QUESTION_TEMPLATES = (
    # 1. Basic questions
    "What is {chapter} about?",
    "What are the main principles discussed in {chapter}?",
    "What are the key takeaways from {chapter}?",
    # 2. Recovery-specific questions
    "How does {chapter} help someone stay sober?",
    "What solutions to alcoholism are presented in {chapter}?",
    "What role does surrender play in {chapter}?",
    # 3. Emotional/mental questions
    "What fears are addressed in {chapter}?",
    "How does {chapter} deal with resentment?",
    "What mental shifts are suggested in {chapter}?",
    # 4. Spiritual growth questions
    "How does {chapter} address spiritual growth?",
    "What spiritual principles are discussed in {chapter}?",
    "How does {chapter} help develop faith?",
    # 5. Practical action questions
    "What specific actions are recommended in {chapter}?",
    "What daily practices are suggested in {chapter}?",
    "What habits need changing according to {chapter}?",
    # 6. Fellowship questions
    "How does {chapter} discuss helping others?",
    "What role does sponsorship play in {chapter}?",
    "How does {chapter} address working with newcomers?",
    # 7. Personal experience questions
    "What personal experiences are shared in {chapter}?",
    "What transformations are described in {chapter}?",
    "What struggles and victories are mentioned in {chapter}?",
    # 8. Relationship questions
    "How does {chapter} address family relationships?",
    "What guidance about relationships is given in {chapter}?",
    "How does {chapter} discuss making amends?",
    # 9. Common obstacles questions
    "What obstacles to recovery are discussed in {chapter}?",
    "How does {chapter} address denial?",
    "What solutions to common problems are offered in {chapter}?",
)

# 10. Extra questions asked only for particular chapters. Each entry is
# (chapter names, templates); the chapter groups do not overlap.
_CHAPTER_SPECIFIC_TEMPLATES = (
    (
        ("THE DOCTOR'S OPINION",),
        (
            "What medical perspective is shared in {chapter}?",
            "How does the doctor describe alcoholism?",
            "What physical aspects of alcoholism are discussed?",
        ),
    ),
    (
        ("BILL'S STORY", "A VISION FOR YOU"),
        (
            "What was the turning point in this story?",
            "How did spiritual experience play a role?",
            "What was the progression of alcoholism described?",
        ),
    ),
    (
        ("HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"),
        (
            "What specific steps are outlined in {chapter}?",
            "How should one practice these principles?",
            "What actions are essential according to {chapter}?",
        ),
    ),
)


def generate_qa_pairs(chapter, text, source_info):
    """Generate the Q&A pairs for one chapter.

    Args:
        chapter: Chapter name substituted into the question templates and
            compared against the chapter-specific groups.
        text: Full chapter text; None is treated as empty.
        source_info: Passed through unchanged to create_qa_pair.

    Returns:
        A list of records built by create_qa_pair: the general questions,
        then any chapter-specific questions, then up to three
        paragraph-level questions.
    """
    text = text or ""

    # Keep only substantial paragraphs, stored stripped so the excerpt used
    # as the answer does not begin with stray whitespace.
    paragraphs = [
        stripped
        for stripped in (p.strip() for p in text.split('\n\n'))
        if len(stripped) > 100
    ]

    templates = list(_GENERAL_QUESTION_TEMPLATES)
    for chapter_names, extra_templates in _CHAPTER_SPECIFIC_TEMPLATES:
        if chapter in chapter_names:
            templates.extend(extra_templates)
            break

    # NOTE(review): every chapter-level question is answered with the same
    # `text`, which create_qa_pair cuts to its max_length, so all of these
    # pairs share one answer (the opening of the chapter). Confirm this is
    # the intended dataset shape.
    qa_pairs = [
        create_qa_pair(template.format(chapter=chapter), text, source_info)
        for template in templates
    ]

    # 11. Paragraph-specific questions, emitted only when the chapter has at
    # least three substantial paragraphs.
    if len(paragraphs) > 2:
        for i, para in enumerate(paragraphs[:3]):
            qa_pairs.append(
                create_qa_pair(
                    f"What key point is made in paragraph {i+1} of {chapter}?",
                    para,
                    source_info,
                )
            )

    return qa_pairs
def extract_key_concepts(text):
    """Return the known AA concepts that occur in text.

    Matching is case-insensitive and anchored at the start of a word, so
    "spirituality" counts as "spiritual" but "dishonesty" does not count as
    "honesty" and "community" does not count as "unity".

    Args:
        text: Text to scan; None or an empty string yields an empty list.

    Returns:
        The matching concepts, in the order of the internal concept list.
    """
    concepts = [
        "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
        "steps", "program", "healing", "hope", "solution", "experience",
        "strength", "faith", "willingness", "honesty", "humility",
        "surrender", "acceptance", "service", "meditation", "prayer",
        "amends", "inventory", "powerlessness", "unity", "sponsorship"
    ]
    if not text:
        return []

    # Lower-case once instead of once per concept.
    lowered = text.lower()
    return [c for c in concepts if re.search(r"\b" + re.escape(c), lowered)]
def find_relevant_excerpt(text, concept, max_length=300):
    """Find a relevant portion of text for a concept.

    The text is split on "." and the first piece containing the concept
    (case-insensitive substring match) is returned. If no piece matches,
    the start of the whole text is returned instead.

    Args:
        text: Text to search; None or an empty string yields "".
        concept: Term to look for.
        max_length: Maximum number of characters in the returned excerpt.

    Returns:
        The excerpt with surrounding whitespace removed, at most
        max_length characters long.
    """
    if not text:
        return ""

    needle = concept.lower()
    for sentence in text.split('.'):
        if needle in sentence.lower():
            # Pieces after the first begin with the whitespace that followed
            # the previous full stop, so strip before applying the limit.
            return sentence.strip()[:max_length]
    return text.strip()[:max_length]
# Lookup from chapter identifier to display name. Built once at import time
# instead of on every call.
# NOTE(review): keys such as "000", "32" and "1935" look like artefacts of
# how the input file labels its sections -- confirm against the input data.
_CHAPTER_NAMES = {
    "1": "BILL'S STORY",
    "2": "THERE IS A SOLUTION",
    "3": "MORE ABOUT ALCOHOLISM",
    "4": "WE AGNOSTICS",
    "5": "HOW IT WORKS",
    "6": "INTO ACTION",
    "7": "WORKING WITH OTHERS",
    "8": "TO WIVES",
    "9": "THE FAMILY AFTERWARD",
    "10": "TO EMPLOYERS",
    "11": "A VISION FOR YOU",
    "12": "A WAY OUT",
    "000": "THE DOCTOR'S OPINION",
    "32": "FOREWORD",
    "1935": "HISTORICAL NOTE",
}


def clean_chapter_name(chapter):
    """Normalise a raw chapter label to its display name.

    Leading and trailing periods and spaces are removed. A label of the
    form "Chapter N ..." (any letter case, any amount of whitespace) is
    reduced to N with adjacent punctuation dropped. The result is then
    looked up in the chapter-name table.

    Args:
        chapter: Raw chapter label as a string.

    Returns:
        The mapped chapter name, or the cleaned label itself when the
        table has no entry for it.
    """
    chapter = chapter.strip(". ")

    # split() with no argument collapses runs of whitespace, so a label such
    # as "Chapter  5" yields "5" instead of an empty token.
    parts = chapter.split()
    if len(parts) >= 2 and parts[0].lower() == "chapter":
        # Drop punctuation glued to the number, e.g. "5:" or "5.".
        chapter = parts[1].strip(".:,-")

    return _CHAPTER_NAMES.get(chapter, chapter)
# Main processing: read chapters from input_file, generate Q&A pairs for the
# first occurrence of each cleaned chapter name, and write them to
# output_file as JSON Lines.
if __name__ == "__main__":
    qa_data = []
    processed_chapters = set()  # Cleaned chapter names already handled
    line_no = 0  # Last input line read, used in the JSON error message

    try:
        print(f"Reading from {input_file}...")
        # Explicit encoding so decoding does not depend on the platform's
        # default locale.
        with open(input_file, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, 1):
                # A blank line (e.g. a trailing newline at end of file) is
                # not a record; skip it instead of failing the whole run.
                if not line.strip():
                    continue

                entry = json.loads(line)
                original_chapter = entry.get("chapter", "Unnamed Chapter")
                chapter = clean_chapter_name(original_chapter)

                # Skip if we've already processed this chapter
                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue
                processed_chapters.add(chapter)

                text = entry.get("text", "")
                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")

                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content"
                }
                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")

        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(qa) + "\n" for qa in qa_data)

        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
    except json.JSONDecodeError as e:
        # Report where the malformed record is so it can be fixed.
        print(f"Error: Invalid JSON on line {line_no} of '{input_file}': {e}")
    except Exception as e:
        # Top-level boundary of the script: report and stop.
        print(f"Error: {str(e)}")