Spaces:
Sleeping
Sleeping
File size: 1,141 Bytes
a24a077 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
#
# Reads "tbot-dataset/" + name + ".pdf"
# Outputs "tbot-dataset/" + name + ".json"
#
import PyPDF2
from langchain.schema import Document
import json
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The text extracted from each page is followed by a newline, and the
    pages are concatenated in document order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages; empty when the PDF
        has no pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result with a single join rather than growing a string
        # with += per page, which can re-copy the accumulated text each time.
        return "".join(page.extract_text() + "\n" for page in reader.pages)
def create_langchain_document(page_content, metadata=None):
    """Wrap text in a LangChain ``Document``.

    Args:
        page_content: Text to store as the document's content.
        metadata: Optional dict of metadata to attach. When omitted (or
            ``None``), an empty dict is used.

    Returns:
        The constructed ``Document``.
    """
    # Document declares ``metadata`` as a dict field, so forwarding the
    # ``None`` default explicitly would be rejected by its validation.
    if metadata is None:
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)
def save_document_to_json(document, json_path):
    """Serialize ``document`` to JSON and write it to ``json_path``.

    Args:
        document: Object exposing a ``dict()`` method that returns a
            JSON-serializable mapping (e.g. a LangChain ``Document``).
        json_path: Path of the JSON file to create or overwrite.

    Raises:
        TypeError: If the mapping returned by ``document.dict()`` contains
            values ``json`` cannot serialize. The target file is left
            untouched in that case.
    """
    # Serialize before opening the file: opening in 'w' mode truncates it,
    # so a serialization error must not be allowed to happen afterwards or
    # an existing output would be replaced by empty/partial JSON.
    serialized = json.dumps(document.dict())
    with open(json_path, 'w', encoding="utf-8") as json_file:
        json_file.write(serialized)
# The input PDF and the output JSON share one base name under tbot-dataset/
name = "PFC ebook"
pdf_path = f"tbot-dataset/{name}.pdf"
json_path = f"tbot-dataset/{name}.json"

# Pull the text out of the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Wrap the text in a LangChain document, recording the file it came from
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(extracted_text, metadata)

# Write the document out as JSON and report where it went
save_document_to_json(langchain_document, json_path)
print(f"Document saved to {json_path}")
|