#
# Reads "tbot-dataset/" + name + ".pdf"
# Outputs "tbot-dataset/" + name + ".json"
#
import PyPDF2
from langchain.schema import Document
import json

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text extracted from each page, in page order, with a newline
        appended after every page.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with one join instead of growing a string with
        # += per page, which can degrade to quadratic time on long PDFs.
        # The join runs inside the `with` so pages are read while the file
        # is still open.
        return "".join(page.extract_text() + "\n" for page in reader.pages)

def create_langchain_document(page_content, metadata=None):
    """Build a LangChain ``Document`` from text and optional metadata.

    Args:
        page_content: Text to store as the document's content.
        metadata: Optional dict of metadata to attach. When omitted
            (``None``), an empty dict is used.

    Returns:
        The constructed ``Document``.
    """
    # NOTE(review): LangChain declares Document.metadata as a dict field, so
    # forwarding the None default is expected to fail validation; substitute
    # a fresh empty dict so the default argument is actually usable.
    if metadata is None:
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)

def save_document_to_json(document, json_path):
    """Serialize a document to a JSON file.

    Args:
        document: Object exposing a ``dict()`` method whose result is
            JSON-serializable.
        json_path: Path of the JSON file to write; it is opened in write
            mode, so any existing content is replaced.
    """
    with open(json_path, mode="w") as out_handle:
        # Convert only after the file is open, then stream it out as JSON.
        payload = document.dict()
        json.dump(payload, out_handle)

# Input and output share the same stem inside tbot-dataset/
name = "PFC ebook"
pdf_path, json_path = (
    "tbot-dataset/" + name + extension for extension in (".pdf", ".json")
)

# Pull the text of every page out of the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Wrap the text in a LangChain document, recording which file it came from
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(
    page_content=extracted_text,
    metadata=metadata,
)

# Write the document out as JSON and report where it went
save_document_to_json(langchain_document, json_path)

print(f"Document saved to {json_path}")