aminaj committed on
Commit
5ac9b29
·
verified ·
1 Parent(s): 0dca065

Create backend/pdf_ingestion.py

Browse files
Files changed (1) hide show
  1. backend/pdf_ingestion.py +19 -0
backend/pdf_ingestion.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+
4
+ # Load and split the PDF document and return the documents and text chunks
5
def load_split_pdf(file_path, *, chunk_size=500, chunk_overlap=50):
    """Load a PDF and split its text into overlapping chunks.

    Args:
        file_path: Location of the PDF, passed straight to ``PyPDFLoader``.
        chunk_size: Upper bound on the size of each chunk, as measured by
            the splitter's length function (characters by default).
        chunk_overlap: Amount of text shared between consecutive chunks so
            that context is not lost at chunk boundaries.

    Returns:
        A ``(documents, chunks)`` tuple: the documents produced by the
        loader, and the smaller documents produced by splitting them.

    Raises:
        Whatever ``PyPDFLoader`` or ``RecursiveCharacterTextSplitter`` raise
        for an unreadable path or an invalid size/overlap combination; no
        errors are caught here.
    """
    documents = PyPDFLoader(file_path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Generic fallback order: prefer paragraph breaks, then line breaks,
        # then spaces, and only as a last resort cut between characters.
        separators=["\n\n", "\n", " ", ""],
    )

    chunks = text_splitter.split_documents(documents)
    return documents, chunks