File size: 1,141 Bytes
a24a077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#
# Reads "tbot-dataset/" + name + ".pdf", extracts the text of every page,
# and outputs it as a LangChain Document in "tbot-dataset/" + name + ".json"
#
import PyPDF2
from langchain.schema import Document
import json

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text extracted from each page, with a newline appended after
        every page. An empty string when the PDF has no pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with += per page, and build
        # the result while the file is still open, as the original loop did.
        # A falsy extraction result (e.g. None) is treated as empty text so a
        # single page cannot raise TypeError on the "\n" concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

def create_langchain_document(page_content, metadata=None):
    """Wrap text in a LangChain ``Document``.

    Args:
        page_content: Text to store as the document's content.
        metadata: Optional mapping stored as the document's metadata.
            When omitted (``None``), an empty dict is used.

    Returns:
        A ``Document`` built from the given content and metadata.
    """
    # Do not forward None: LangChain declares Document.metadata as a dict
    # field, so the default one-argument call would be rejected by its
    # validation (NOTE(review): confirm against the installed version).
    if metadata is None:
        metadata = {}
    return Document(page_content=page_content, metadata=metadata)

def save_document_to_json(document, json_path):
    """Serialize a document to a JSON file.

    Args:
        document: Object exposing a ``dict()`` method; its return value is
            what gets written as JSON.
        json_path: Destination path, opened in text write mode ("w").
    """
    with open(json_path, mode="w") as sink:
        # dict() is called only after the file has been opened, matching
        # the original evaluation order.
        payload = document.dict()
        json.dump(payload, sink)

# The input PDF and the output JSON share one base name under tbot-dataset/
name = "PFC ebook"
_base = "tbot-dataset/" + name
pdf_path = _base + ".pdf"
json_path = _base + ".json"

# Pull the text out of the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Wrap the text in a LangChain document, recording the PDF path as its source
metadata = {"source": pdf_path}
langchain_document = create_langchain_document(
    page_content=extracted_text,
    metadata=metadata,
)

# Write the LangChain document out as JSON and report where it went
save_document_to_json(document=langchain_document, json_path=json_path)

print(f"Document saved to {json_path}")