|
import json |
|
|
|
|
|
# Path of the JSONL input file; the script reads it line by line and parses
# each line as a JSON object, using its "chapter" and "text" keys.
input_file = "cleaned_big_book.jsonl"

# Path of the JSONL output file; one generated Q&A record is written per line.
output_file = "qa_dataset.jsonl"
|
|
|
def create_qa_pair(prompt, response, source_info, location="chapter_overview", max_length=300):
    """Build one prompt/response record with fixed book metadata.

    Args:
        prompt: Question text; it is wrapped in the
            "### Question: ...", "### Answer:" template.
        response: Answer text. A falsy value (``None`` or ``""``) yields an
            empty response.
        source_info: Mapping that must contain a ``"chapter"`` key.
        location: Stored verbatim as the ``"section"`` metadata value.
        max_length: Maximum number of characters kept from ``response``.

    Returns:
        A dict with ``"prompt"``, ``"response"`` and ``"metadata"`` keys.

    Raises:
        KeyError: If ``source_info`` has no ``"chapter"`` entry.
    """
    if response:
        # Trim surrounding whitespace before cutting so that leading blank
        # lines do not consume the length budget, then drop any whitespace
        # left dangling at the cut point.
        cleaned_response = response.strip()[:max_length].rstrip()
    else:
        cleaned_response = ""

    return {
        "prompt": f"### Question: {prompt}\n\n### Answer:",
        "response": cleaned_response,
        "metadata": {
            "book": "Alcoholics Anonymous",
            "chapter": source_info["chapter"],
            "section": location,
            "edition": "First 164 pages",
            "type": "primary_text",
        },
    }
|
|
|
# Question templates asked of every chapter, in output order.
# "{chapter}" is replaced with the chapter name.
_GENERAL_QUESTION_TEMPLATES = (
    "What is {chapter} about?",
    "What are the main principles discussed in {chapter}?",
    "What are the key takeaways from {chapter}?",
    "How does {chapter} help someone stay sober?",
    "What solutions to alcoholism are presented in {chapter}?",
    "What role does surrender play in {chapter}?",
    "What fears are addressed in {chapter}?",
    "How does {chapter} deal with resentment?",
    "What mental shifts are suggested in {chapter}?",
    "How does {chapter} address spiritual growth?",
    "What spiritual principles are discussed in {chapter}?",
    "How does {chapter} help develop faith?",
    "What specific actions are recommended in {chapter}?",
    "What daily practices are suggested in {chapter}?",
    "What habits need changing according to {chapter}?",
    "How does {chapter} discuss helping others?",
    "What role does sponsorship play in {chapter}?",
    "How does {chapter} address working with newcomers?",
    "What personal experiences are shared in {chapter}?",
    "What transformations are described in {chapter}?",
    "What struggles and victories are mentioned in {chapter}?",
    "How does {chapter} address family relationships?",
    "What guidance about relationships is given in {chapter}?",
    "How does {chapter} discuss making amends?",
    "What obstacles to recovery are discussed in {chapter}?",
    "How does {chapter} address denial?",
    "What solutions to common problems are offered in {chapter}?",
)

# Extra templates, each paired with the exact chapter names it applies to.
# Only the first matching group is used.
_CHAPTER_SPECIFIC_TEMPLATES = (
    (
        ("THE DOCTOR'S OPINION",),
        (
            "What medical perspective is shared in {chapter}?",
            "How does the doctor describe alcoholism?",
            "What physical aspects of alcoholism are discussed?",
        ),
    ),
    (
        ("BILL'S STORY", "A VISION FOR YOU"),
        (
            "What was the turning point in this story?",
            "How did spiritual experience play a role?",
            "What was the progression of alcoholism described?",
        ),
    ),
    (
        ("HOW IT WORKS", "INTO ACTION", "WORKING WITH OTHERS"),
        (
            "What specific steps are outlined in {chapter}?",
            "How should one practice these principles?",
            "What actions are essential according to {chapter}?",
        ),
    ),
)


def generate_qa_pairs(chapter, text, source_info):
    """Generate the Q&A records for one chapter.

    Every chapter-level question is paired with the full chapter ``text``
    as its response. In addition, when the chapter has at least three
    paragraphs longer than 100 characters (after stripping), one question
    is generated for each of the first three such paragraphs.

    Args:
        chapter: Chapter name inserted into the question templates and
            compared against the chapter-specific name groups.
        text: Chapter text; paragraphs are separated by blank lines
            (``"\\n\\n"``). ``None`` is treated like an empty string.
        source_info: Passed through unchanged to ``create_qa_pair``.

    Returns:
        A list of records in the order: general questions,
        chapter-specific questions (if any), paragraph questions (if any).
    """
    # None is treated as empty text so paragraph splitting cannot raise.
    body = text or ""

    # Keep the stripped paragraph (the same string the length filter
    # measured) so excerpts do not start with stray newlines or indentation.
    stripped = (chunk.strip() for chunk in body.split("\n\n"))
    paragraphs = [chunk for chunk in stripped if len(chunk) > 100]

    templates = list(_GENERAL_QUESTION_TEMPLATES)
    for chapter_names, extra_templates in _CHAPTER_SPECIFIC_TEMPLATES:
        if chapter in chapter_names:
            templates.extend(extra_templates)
            break

    qa_pairs = [
        create_qa_pair(template.format(chapter=chapter), text, source_info)
        for template in templates
    ]

    # Paragraph numbers count only the qualifying (long) paragraphs, not
    # the paragraph's position in the raw text.
    if len(paragraphs) > 2:
        for number, para in enumerate(paragraphs[:3], start=1):
            qa_pairs.append(
                create_qa_pair(
                    f"What key point is made in paragraph {number} of {chapter}?",
                    para,
                    source_info,
                )
            )

    return qa_pairs
|
|
|
# Concepts looked for in chapter text; all entries are lowercase.
_KEY_CONCEPTS = (
    "recovery", "sobriety", "alcoholism", "spiritual", "fellowship",
    "steps", "program", "healing", "hope", "solution", "experience",
    "strength", "faith", "willingness", "honesty", "humility",
    "surrender", "acceptance", "service", "meditation", "prayer",
    "amends", "inventory", "powerlessness", "unity", "sponsorship",
)


def extract_key_concepts(text):
    """Return the known concepts that occur in ``text``.

    Matching is a case-insensitive substring test, so a concept also
    matches inside a longer word (for example "unity" inside "community").

    Args:
        text: Text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matching concepts, in the order of ``_KEY_CONCEPTS``.
    """
    if not text:
        return []
    # Lowercase the text once rather than once per concept.
    haystack = text.lower()
    return [concept for concept in _KEY_CONCEPTS if concept in haystack]
|
|
|
def find_relevant_excerpt(text, concept, max_length=300):
    """Return the first sentence of ``text`` that mentions ``concept``.

    Sentences are obtained by splitting on ".", so the returned excerpt
    carries no trailing period, and abbreviations containing periods are
    split as well. Matching is a case-insensitive substring test.

    Args:
        text: Text to search. ``None`` or an empty string yields ``""``.
        concept: Term to look for.
        max_length: Maximum number of characters in the returned excerpt.

    Returns:
        The first matching sentence, stripped of surrounding whitespace and
        cut to ``max_length``. If no sentence matches, the start of ``text``
        with the same treatment.
    """
    if not text:
        return ""

    needle = concept.lower()  # lowercase once, not once per sentence
    for sentence in text.split('.'):
        if needle in sentence.lower():
            # Splitting on "." leaves the space that followed the previous
            # period at the front of the sentence; strip it.
            return sentence.strip()[:max_length]
    return text.strip()[:max_length]
|
|
|
# Maps chapter identifiers to chapter titles. The non-sequential keys
# ("000", "32", "1935") are presumably markers produced when the input was
# prepared -- verify against the input data.
_CHAPTER_NAME_MAP = {
    "1": "BILL'S STORY",
    "2": "THERE IS A SOLUTION",
    "3": "MORE ABOUT ALCOHOLISM",
    "4": "WE AGNOSTICS",
    "5": "HOW IT WORKS",
    "6": "INTO ACTION",
    "7": "WORKING WITH OTHERS",
    "8": "TO WIVES",
    "9": "THE FAMILY AFTERWARD",
    "10": "TO EMPLOYERS",
    "11": "A VISION FOR YOU",
    "12": "A WAY OUT",
    "000": "THE DOCTOR'S OPINION",
    "32": "FOREWORD",
    "1935": "HISTORICAL NOTE",
}


def clean_chapter_name(chapter):
    """Normalize a raw chapter label to a chapter title where possible.

    Leading and trailing periods and spaces are removed. A label starting
    with "Chapter " is reduced to the token that follows it. The result is
    then looked up in ``_CHAPTER_NAME_MAP``; labels without a mapping are
    returned as cleaned.

    Args:
        chapter: Raw chapter label (a string).

    Returns:
        The mapped chapter title, or the cleaned label if no mapping exists.
    """
    name = chapter.strip(". ")

    if name.startswith("Chapter "):
        # split() without an argument tolerates repeated whitespace, which
        # split(" ") would turn into an empty token.
        parts = name.split()
        if len(parts) > 1:
            token = parts[1]
            # Drop punctuation glued to the identifier ("5:" or "5.") so the
            # lookup below can succeed; keep the token if nothing is left.
            name = token.rstrip(".:,;") or token

    return _CHAPTER_NAME_MAP.get(name, name)
|
|
|
|
|
def main():
    """Read the chapter JSONL file and write the generated Q&A JSONL file.

    Each non-blank input line is parsed as a JSON object; its "chapter"
    value is normalized with ``clean_chapter_name`` and its "text" value is
    passed to ``generate_qa_pairs``. Only the first entry for each
    normalized chapter name is processed. Nothing is written if reading
    fails; errors are reported on standard output.
    """
    qa_data = []
    processed_chapters = set()

    try:
        print(f"Reading from {input_file}...")
        # Explicit UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, 1):
                # Blank lines (e.g. a trailing newline at end of file) are
                # not records; skip them instead of failing to parse.
                if not line.strip():
                    continue

                try:
                    entry = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error: Invalid JSON on line {line_number} of '{input_file}': {e}")
                    return

                original_chapter = entry.get("chapter", "Unnamed Chapter")
                chapter = clean_chapter_name(original_chapter)

                if chapter in processed_chapters:
                    print(f"Skipping duplicate chapter: {original_chapter} -> {chapter}")
                    continue

                processed_chapters.add(chapter)
                text = entry.get("text", "")

                print(f"Processing chapter {len(processed_chapters)}: {original_chapter} -> {chapter}")

                source_info = {
                    "chapter": chapter,
                    "text_type": "chapter_content",
                }

                qa_pairs = generate_qa_pairs(chapter, text, source_info)
                qa_data.extend(qa_pairs)
                print(f"Generated {len(qa_pairs)} Q&A pairs for {chapter}")

        print(f"\nWriting {len(qa_data)} Q&A pairs to {output_file}")
        with open(output_file, "w", encoding="utf-8") as f:
            for qa in qa_data:
                f.write(json.dumps(qa) + "\n")

        print("\nFinal Statistics:")
        print(f"Total unique chapters processed: {len(processed_chapters)}")
        print(f"Total Q&A pairs generated: {len(qa_data)}")

    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
    except Exception as e:
        # Top-level boundary of the script: report the problem instead of
        # showing a traceback.
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()