"""Build a question/answer JSONL dataset from chapter records.

Reads one JSON object per line from ``input_file`` (each record is read for
its "chapter" and "text" fields), generates a fixed set of templated
questions for every unique chapter, and writes the resulting Q&A records to
``output_file``, one JSON object per line.
"""

import json

# Input and output files
input_file = "cleaned_big_book.jsonl"
output_file = "qa_dataset.jsonl"

# Question templates asked about every chapter. "{chapter}" is replaced with
# the cleaned chapter name. The order here is the order of the output records.
_GENERAL_QUESTIONS = (
    # 1. Basic questions
    "What is {chapter} about?",
    "What are the main principles discussed in {chapter}?",
    "What are the key takeaways from {chapter}?",
    # 2. Recovery-specific questions
    "How does {chapter} help someone stay sober?",
    "What solutions to alcoholism are presented in {chapter}?",
    "What role does surrender play in {chapter}?",
    # 3. Emotional/mental questions
    "What fears are addressed in {chapter}?",
    "How does {chapter} deal with resentment?",
    "What mental shifts are suggested in {chapter}?",
    # 4. Spiritual growth questions
    "How does {chapter} address spiritual growth?",
    "What spiritual principles are discussed in {chapter}?",
    "How does {chapter} help develop faith?",
    # 5. Practical action questions
    "What specific actions are recommended in {chapter}?",
    "What daily practices are suggested in {chapter}?",
    "What habits need changing according to {chapter}?",
    # 6. Fellowship questions
    "How does {chapter} discuss helping others?",
    "What role does sponsorship play in {chapter}?",
    "How does {chapter} address working with newcomers?",
    # 7. Personal experience questions
    "What personal experiences are shared in {chapter}?",
    "What transformations are described in {chapter}?",
    "What struggles and victories are mentioned in {chapter}?",
    # 8. Relationship questions
    "How does {chapter} address family relationships?",
    "What guidance about relationships is given in {chapter}?",
    "How does {chapter} discuss making amends?",
    # 9. Common obstacles questions
    "What obstacles to recovery are discussed in {chapter}?",
    "How does {chapter} address denial?",
    "What solutions to common problems are offered in {chapter}?",
)

# 10. Chapter-specific questions: (chapter names, extra templates). Only the
# first group containing the chapter applies.
_CHAPTER_SPECIFIC_QUESTIONS = (
    (
        ("THE DOCTOR'S OPINION",),
        (
            "What medical perspective is shared in {chapter}?",
            "How does the doctor describe alcoholism?",
            "What physical aspects of alcoholism are discussed?",
        ),
    ),
    (
        ("BILL'S STORY", "A VISION FOR YOU"),
        (
            "What was the turning point in this story?",
            "How did spiritual experience play a role?",
            "What was the progression of alcoholism described?",
        ),
    ),
    (
        ("HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"),
        (
            "What specific steps are outlined in {chapter}?",
            "How should one practice these principles?",
            "What actions are essential according to {chapter}?",
        ),
    ),
)

# Concepts looked for by extract_key_concepts (all lowercase).
_CONCEPTS = (
    "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
    "steps", "program", "healing", "hope", "solution", "experience",
    "strength", "faith", "willingness", "honesty", "humility",
    "surrender", "acceptance", "service", "meditation", "prayer",
    "amends", "inventory", "powerlessness", "unity", "sponsorship",
)

# Map numbers to proper names (using actual Big Book chapter names)
_CHAPTER_NAMES = {
    "1": "BILL'S STORY",
    "2": "THERE IS A SOLUTION",
    "3": "MORE ABOUT ALCOHOLISM",
    "4": "WE AGNOSTICS",
    "5": "HOW IT WORKS",
    "6": "INTO ACTION",
    "7": "WORKING WITH OTHERS",
    "8": "TO WIVES",
    "9": "THE FAMILY AFTERWARD",
    "10": "TO EMPLOYERS",
    "11": "A VISION FOR YOU",
    "12": "A WAY OUT",
    "000": "THE DOCTOR'S OPINION",
    "32": "FOREWORD",
    "1935": "HISTORICAL NOTE",
}


def create_qa_pair(prompt, response, source_info,
                   location="chapter_overview", max_length=300):
    """Create one formatted Q&A record.

    Args:
        prompt: The question text; it is wrapped in the
            "### Question: ... ### Answer:" prompt layout.
        response: The answer text. It is cut to at most ``max_length``
            characters; a falsy value (``None`` or ``""``) becomes ``""``.
        source_info: Mapping that must contain a "chapter" key.
        location: Value stored under ``metadata["section"]``.
        max_length: Maximum number of characters kept from ``response``.

    Returns:
        A dict with "prompt", "response" and "metadata" keys.

    Raises:
        KeyError: If ``source_info`` has no "chapter" key.
    """
    # Clean and truncate response based on max_length
    cleaned_response = response[:max_length] if response else ""

    return {
        "prompt": f"### Question: {prompt}\n\n### Answer:",
        "response": cleaned_response,
        "metadata": {
            "book": "Alcoholics Anonymous",
            "chapter": source_info["chapter"],
            "section": location,
            "edition": "First 164 pages",
            "type": "primary_text",
        },
    }


def generate_qa_pairs(chapter, text, source_info):
    """Generate the full list of Q&A records for one chapter.

    Every chapter gets the general questions, then any chapter-specific
    questions, each answered with the (truncated) chapter text. If the text
    has more than two blank-line-separated paragraphs longer than 100
    characters after stripping, the first three of them each get a
    paragraph question answered with that paragraph.

    Args:
        chapter: Cleaned chapter name, inserted into the question templates.
        text: Chapter text; ``None`` is treated as an empty string.
        source_info: Mapping passed through to ``create_qa_pair``.

    Returns:
        A list of Q&A dicts in a fixed, deterministic order.
    """
    text = text or ""  # a JSON null text must not crash the split below
    paragraphs = [p for p in text.split("\n\n") if len(p.strip()) > 100]

    templates = list(_GENERAL_QUESTIONS)
    for chapter_names, extra_templates in _CHAPTER_SPECIFIC_QUESTIONS:
        if chapter in chapter_names:
            templates.extend(extra_templates)
            break

    qa_pairs = [
        create_qa_pair(template.format(chapter=chapter), text, source_info)
        for template in templates
    ]

    # 11. Add paragraph-specific questions for substance
    if len(paragraphs) > 2:
        qa_pairs.extend(
            create_qa_pair(
                f"What key point is made in paragraph {number} of {chapter}?",
                paragraph,
                source_info,
            )
            for number, paragraph in enumerate(paragraphs[:3], 1)
        )

    return qa_pairs


def extract_key_concepts(text):
    """Return the known concepts that occur in ``text``, case-insensitively.

    Matching is by substring, and the result keeps the order of the
    internal concept list rather than the order of appearance in ``text``.
    """
    lowered = text.lower()  # lowercase once instead of once per concept
    return [concept for concept in _CONCEPTS if concept in lowered]


def find_relevant_excerpt(text, concept, max_length=300):
    """Return the first "."-delimited sentence of ``text`` mentioning ``concept``.

    The match is a case-insensitive substring test. The sentence is returned
    as split (surrounding whitespace is kept) and cut to ``max_length``
    characters. If no sentence matches, the start of ``text`` is returned
    instead, cut to the same length.
    """
    needle = concept.lower()
    for sentence in text.split("."):
        if needle in sentence.lower():
            return sentence[:max_length]
    return text[:max_length]


def clean_chapter_name(chapter):
    """Normalize a raw chapter label to its proper chapter name.

    Leading/trailing periods and spaces are removed, a "Chapter X" label is
    reduced to "X", and known identifiers are mapped to their names. Labels
    that are not in the map are returned as cleaned.
    """
    # First remove any leading/trailing periods and spaces
    chapter = chapter.strip(". ")

    # Extract number if it's in "Chapter X" format. Splitting on any run of
    # whitespace keeps "Chapter  5" from yielding an empty identifier.
    if chapter.startswith("Chapter "):
        parts = chapter.split()
        if len(parts) > 1:
            chapter = parts[1]

    return _CHAPTER_NAMES.get(chapter, chapter)


def main():
    """Read the input JSONL, build Q&A records, and write the output JSONL.

    Only the first record seen for each cleaned chapter name is processed;
    later records for the same chapter are skipped. Nothing is written
    unless the whole input was read successfully.

    Returns:
        0 on success, 1 if an error was reported.
    """
    qa_data = []
    processed_chapters = set()  # Keep track of chapters we've already processed

    try:
        print(f"Reading from {input_file}...")
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, 1):
                if not line.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as e:
                    raise ValueError(
                        f"Invalid JSON on line {line_number} of '{input_file}': {e}"
                    ) from e
                if not isinstance(entry, dict):
                    raise ValueError(
                        f"Line {line_number} of '{input_file}' is not a JSON object"
                    )

                original_chapter = entry.get("chapter", "Unnamed Chapter")
                chapter = clean_chapter_name(original_chapter)

                # Skip if we've already processed this chapter
                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue

                processed_chapters.add(chapter)
                text = entry.get("text", "")

                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")

                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content",
                }

                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")

        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(qa) + "\n" for qa in qa_data)

        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")

    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
        return 1
    except Exception as e:  # top-level boundary: report and signal failure
        print(f"Error: {str(e)}")
        return 1

    return 0


# Main processing
if __name__ == "__main__":
    raise SystemExit(main())