Spaces:
Running
Running
Create backend/pdf_ingestion.py
Browse files- backend/pdf_ingestion.py +19 -0
backend/pdf_ingestion.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyPDFLoader
|
2 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
3 |
+
|
4 |
+
# Load and split the PDF document and return the documents and text chunks
|
5 |
+
def load_split_pdf(file_path, *, chunk_size: int = 500, chunk_overlap: int = 50):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        file_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.
        chunk_size: Maximum size of each chunk handed to the splitter.
            Defaults to 500, the value this function previously hard-coded.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Defaults to 50, the value this function previously hard-coded.

    Returns:
        A ``(documents, chunks)`` tuple: ``documents`` is whatever
        ``PyPDFLoader.load()`` returned, and ``chunks`` is the result of
        running those documents through the text splitter.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Generic separators tried coarsest-first: paragraph break, line
        # break, space, and finally the empty string (split anywhere).
        # Nothing here is specific to a particular document type.
        separators=["\n\n", "\n", " ", ""],
    )

    # The unsplit documents are returned alongside the chunks so callers can
    # still reach the original loader output.
    chunks = text_splitter.split_documents(documents)
    return documents, chunks
|