Spaces:

Lukeam
/

aa_book

Sleeping

App Files Files Community

aa_book / qanda_gen_script.py

lambchop11

Updated Q&A dataset with enhanced questions

87ce387 3 months ago

raw

history blame contribute delete

9.29 kB

	import json

	# Input and output files.
	# input_file is read line by line, each line parsed as one JSON object;
	# output_file receives one JSON-encoded Q&A pair per line.
	input_file = "cleaned_big_book.jsonl"
	output_file = "qa_dataset.jsonl"

	def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
	    """
	    Build one Q&A record ready for JSON serialization.

	    Args:
	        prompt: Question text; wrapped in the "### Question / ### Answer" template.
	        response: Answer text; cut to its first ``max_length`` characters.
	            A falsy value (``None``, ``""``) becomes an empty string.
	        source_info: Mapping that must contain a ``"chapter"`` key.
	        location: Value stored under ``metadata["section"]``.
	        max_length: Maximum number of characters kept from ``response``.

	    Returns:
	        A dict with the keys ``"prompt"``, ``"response"`` and ``"metadata"``.

	    Raises:
	        KeyError: if ``source_info`` has no ``"chapter"`` entry.
	    """
	    # Truncate the answer; anything falsy collapses to an empty answer.
	    if response:
	        answer = response[:max_length]
	    else:
	        answer = ""

	    question_block = f"### Question: {prompt}\n\n### Answer:"

	    # Provenance fields are fixed except for the chapter and section.
	    provenance = {
	        "book": "Alcoholics Anonymous",
	        "chapter": source_info["chapter"],
	        "section": location,
	        "edition": "First 164 pages",
	        "type": "primary_text",
	    }

	    record = {"prompt": question_block, "response": answer}
	    record["metadata"] = provenance
	    return record

	def generate_qa_pairs(chapter, text, source_info):
	    """Build the full list of Q&A pairs for one chapter.

	    Every templated question is paired with the chapter ``text`` (which
	    ``create_qa_pair`` truncates). Some chapter titles get three extra
	    questions. When at least three substantial paragraphs exist, the first
	    three each get a paragraph-level question answered by that paragraph.

	    Args:
	        chapter: Chapter title inserted into the question templates.
	        text: Chapter body; paragraphs are separated by blank lines.
	        source_info: Mapping passed through to ``create_qa_pair``.

	    Returns:
	        List of Q&A dicts in a fixed order: general questions,
	        chapter-specific questions, then paragraph questions.
	    """
	    # A paragraph counts as substantial when it exceeds 100 characters once stripped.
	    substantial = [chunk for chunk in text.split('\n\n') if len(chunk.strip()) > 100]

	    general_templates = (
	        # 1. Basic Questions (for all chapters)
	        "What is {chapter} about?",
	        "What are the main principles discussed in {chapter}?",
	        "What are the key takeaways from {chapter}?",
	        # 2. Recovery-Specific Questions
	        "How does {chapter} help someone stay sober?",
	        "What solutions to alcoholism are presented in {chapter}?",
	        "What role does surrender play in {chapter}?",
	        # 3. Emotional/Mental Questions
	        "What fears are addressed in {chapter}?",
	        "How does {chapter} deal with resentment?",
	        "What mental shifts are suggested in {chapter}?",
	        # 4. Spiritual Growth Questions
	        "How does {chapter} address spiritual growth?",
	        "What spiritual principles are discussed in {chapter}?",
	        "How does {chapter} help develop faith?",
	        # 5. Practical Action Questions
	        "What specific actions are recommended in {chapter}?",
	        "What daily practices are suggested in {chapter}?",
	        "What habits need changing according to {chapter}?",
	        # 6. Fellowship Questions
	        "How does {chapter} discuss helping others?",
	        "What role does sponsorship play in {chapter}?",
	        "How does {chapter} address working with newcomers?",
	        # 7. Personal Experience Questions
	        "What personal experiences are shared in {chapter}?",
	        "What transformations are described in {chapter}?",
	        "What struggles and victories are mentioned in {chapter}?",
	        # 8. Relationship Questions
	        "How does {chapter} address family relationships?",
	        "What guidance about relationships is given in {chapter}?",
	        "How does {chapter} discuss making amends?",
	        # 9. Common Obstacles Questions
	        "What obstacles to recovery are discussed in {chapter}?",
	        "How does {chapter} address denial?",
	        "What solutions to common problems are offered in {chapter}?",
	    )

	    # 10. Chapter-Specific Questions: (matching titles, extra templates).
	    chapter_specific = (
	        (
	            ("THE DOCTOR'S OPINION",),
	            (
	                "What medical perspective is shared in {chapter}?",
	                "How does the doctor describe alcoholism?",
	                "What physical aspects of alcoholism are discussed?",
	            ),
	        ),
	        (
	            ("BILL'S STORY", "A VISION FOR YOU"),
	            (
	                "What was the turning point in this story?",
	                "How did spiritual experience play a role?",
	                "What was the progression of alcoholism described?",
	            ),
	        ),
	        (
	            ("HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"),
	            (
	                "What specific steps are outlined in {chapter}?",
	                "How should one practice these principles?",
	                "What actions are essential according to {chapter}?",
	            ),
	        ),
	    )

	    templates = list(general_templates)
	    for titles, extras in chapter_specific:
	        if chapter in titles:
	            templates.extend(extras)
	            break  # at most one group applies to a chapter

	    pairs = [
	        create_qa_pair(template.format(chapter=chapter), text, source_info)
	        for template in templates
	    ]

	    # 11. Paragraph-specific questions, only when three or more substantial paragraphs exist.
	    if len(substantial) >= 3:
	        for number, paragraph in enumerate(substantial[:3], start=1):
	            question = f"What key point is made in paragraph {number} of {chapter}?"
	            pairs.append(create_qa_pair(question, paragraph, source_info))

	    return pairs

	def extract_key_concepts(text):
	    """Return the AA concepts that occur in ``text``.

	    Matching is a case-insensitive substring test, so a concept also matches
	    inside a longer word (e.g. "hope" matches "hoped").

	    Args:
	        text: Text to scan. ``None`` or an empty string yields no concepts.

	    Returns:
	        List of matched concept strings, in the order of the concept list below.
	    """
	    concepts = (
	        "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
	        "steps", "program", "healing", "hope", "solution", "experience",
	        "strength", "faith", "willingness", "honesty", "humility",
	        "surrender", "acceptance", "service", "meditation", "prayer",
	        "amends", "inventory", "powerlessness", "unity", "sponsorship",
	    )
	    if not text:
	        return []

	    # Lowercase the text once rather than once per concept; the concepts
	    # themselves are already lowercase.
	    haystack = text.lower()
	    return [concept for concept in concepts if concept in haystack]

	def find_relevant_excerpt(text, concept, max_length=300):
	    """Return the first sentence of ``text`` that mentions ``concept``.

	    Sentences are the pieces obtained by splitting on ``'.'``, so the
	    returned excerpt does not include the terminating period. Matching is a
	    case-insensitive substring test.

	    Args:
	        text: Text to search. ``None`` or an empty string yields ``""``.
	        concept: Term to look for.
	        max_length: Maximum number of characters returned.

	    Returns:
	        The matching sentence with surrounding whitespace removed and cut to
	        ``max_length`` characters, or the first ``max_length`` characters of
	        ``text`` when no sentence matches.
	    """
	    if not text:
	        return ""

	    needle = concept.lower()  # loop-invariant, so compute it once
	    for sentence in text.split('.'):
	        if needle in sentence.lower():
	            # Splitting on '.' leaves the space/newline that followed the
	            # previous period; strip it before truncating so it does not
	            # use up part of the max_length budget.
	            return sentence.strip()[:max_length]
	    return text[:max_length]

	# Lookup from a chapter label to its title. Built once at import time.
	# NOTE(review): "000", "32" and "1935" are presumably labels that occur in
	# the cleaned input file — confirm against the data.
	_CHAPTER_TITLES = {
	    "1": "BILL'S STORY",
	    "2": "THERE IS A SOLUTION",
	    "3": "MORE ABOUT ALCOHOLISM",
	    "4": "WE AGNOSTICS",
	    "5": "HOW IT WORKS",
	    "6": "INTO ACTION",
	    "7": "WORKING WITH OTHERS",
	    "8": "TO WIVES",
	    "9": "THE FAMILY AFTERWARD",
	    "10": "TO EMPLOYERS",
	    "11": "A VISION FOR YOU",
	    "12": "A WAY OUT",
	    "000": "THE DOCTOR'S OPINION",
	    "32": "FOREWORD",
	    "1935": "HISTORICAL NOTE",
	}


	def clean_chapter_name(chapter):
	    """Normalize a raw chapter label to a chapter title.

	    Leading/trailing periods and spaces are removed. A label of the form
	    "Chapter X ..." is reduced to its second token ``X``, with any trailing
	    ``.`` or ``:`` removed. The result is looked up in the title map, and
	    labels without an entry are returned as cleaned.

	    Args:
	        chapter: Raw chapter label, e.g. ``"Chapter 5."`` or ``"000"``.

	    Returns:
	        The mapped title, or the cleaned label when no mapping exists.
	    """
	    chapter = chapter.strip(". ")

	    # Split on runs of whitespace: split(" ") turns "Chapter  5" into
	    # ["Chapter", "", "5"], which made the label collapse to "".
	    tokens = chapter.split()
	    if len(tokens) >= 2 and tokens[0] == "Chapter":
	        # Drop punctuation glued to the number ("1:" / "1.") so the lookup
	        # still hits; fall back to the raw token if nothing would remain.
	        chapter = tokens[1].rstrip(".:") or tokens[1]

	    return _CHAPTER_TITLES.get(chapter, chapter)

	# Main processing
	def main():
	    """Read chapters from ``input_file`` and write Q&A pairs to ``output_file``.

	    Each non-blank input line is parsed as a JSON object whose ``"chapter"``
	    and ``"text"`` entries are used. Only the first entry seen for a given
	    cleaned chapter name is processed; later ones are skipped. One JSON
	    object per generated Q&A pair is written to the output file.

	    Errors are reported on stdout and end the run without writing output.
	    """
	    qa_data = []
	    processed_chapters = set()  # cleaned chapter names already handled

	    try:
	        print(f"Reading from {input_file}...")
	        # Explicit UTF-8: without it open() uses the platform's locale
	        # encoding, which can fail on or mis-decode non-ASCII text.
	        with open(input_file, "r", encoding="utf-8") as f:
	            for line_number, line in enumerate(f, 1):
	                if not line.strip():
	                    continue  # tolerate blank lines

	                try:
	                    entry = json.loads(line)
	                except json.JSONDecodeError as e:
	                    # Name the offending line so the input can be fixed.
	                    print(f"Error: invalid JSON on line {line_number} of '{input_file}': {e}")
	                    return

	                original_chapter = entry.get("chapter", "Unnamed Chapter")
	                chapter = clean_chapter_name(original_chapter)

	                # Skip if we've already processed this chapter
	                if chapter in processed_chapters:
	                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
	                    continue

	                processed_chapters.add(chapter)
	                text = entry.get("text", "")

	                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")

	                source_info = {
	                    "chapter": chapter,
	                    "text_type": "chapter_content",
	                }

	                qa_pairs = generate_qa_pairs(chapter, text, source_info)
	                qa_data.extend(qa_pairs)
	                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")

	        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
	        with open(output_file, "w", encoding="utf-8") as f:
	            f.writelines(json.dumps(qa) + "\n" for qa in qa_data)

	        print("\nFinal Statistics:")
	        print(f"Total unique chapters processed: {len(processed_chapters)}")
	        print(f"Total Q&A pairs generated: {len(qa_data)}")

	    except FileNotFoundError:
	        print(f"Error: Could not find input file '{input_file}'")
	    except Exception as e:
	        # Top-level boundary: report any other failure instead of a traceback.
	        print(f"Error: {e}")


	if __name__ == "__main__":
	    main()