#
# Reads "tbot-dataset/" + name + ".pdf"
# Outputs "tbot-dataset/" + name + ".json"
#

import json
from typing import Optional

import PyPDF2
from langchain.schema import Document


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*.

    Each page's extracted text is followed by a single newline, and the
    pages are concatenated in document order.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page texts (an empty string for a PDF with no pages).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with += on every page.
        return "".join(page.extract_text() + "\n" for page in reader.pages)


def create_langchain_document(
    page_content: str, metadata: Optional[dict] = None
) -> Document:
    """Wrap *page_content* and *metadata* in a LangChain ``Document``.

    Args:
        page_content: The text the document should carry.
        metadata: Optional metadata mapping. When omitted, an empty dict is
            used so the ``Document`` is never constructed with
            ``metadata=None``.

    Returns:
        The newly created ``Document``.
    """
    if metadata is None:
        # A fresh dict per call; never share one default object across calls.
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)


def save_document_to_json(document: Document, json_path: str) -> None:
    """Serialize *document* to a JSON file at *json_path*.

    The document is converted with its ``dict()`` method and written with
    ``json.dump``; an existing file at *json_path* is overwritten.

    Args:
        document: The document to serialize.
        json_path: Destination path of the JSON file.
    """
    # Explicit encoding so the output does not depend on the platform default.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(document.dict(), json_file)


# Paths for the PDF and the JSON file
name = "PFC ebook"
pdf_path = "tbot-dataset/" + name + ".pdf"
json_path = "tbot-dataset/" + name + ".json"

# Extract text from PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Create a LangChain document
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(extracted_text, metadata)

# Save the LangChain document as a JSON file
save_document_to_json(langchain_document, json_path)

print(f"Document saved to {json_path}")