Spaces:
Runtime error
Runtime error
File size: 1,753 Bytes
5a47e6d 9c98b1f 5a47e6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredAPIFileLoader
def load_documents_OCR(
    file_path,
    unstructured_api,
    *,
    url='https://paf-stkjy1b5.api.unstructuredapp.io/',
):
    """Load documents that require OCR via the Unstructured hosted API.

    Args:
        file_path: Path of the file handed to ``UnstructuredAPIFileLoader``.
        unstructured_api: API key forwarded to the loader as ``api_key``.
        url: Endpoint of the Unstructured API deployment. Defaults to the
            deployment this module was originally written against; pass a
            different value to target another deployment without editing
            this function.

    Returns:
        Whatever ``UnstructuredAPIFileLoader.load()`` returns for the file.
    """
    # NOTE(review): mode='paged' presumably groups extracted elements per
    # page -- confirm against the LangChain loader documentation.
    loader = UnstructuredAPIFileLoader(
        file_path=file_path,
        api_key=unstructured_api,
        url=url,
        mode='paged',
    )
    return loader.load()
def load_documents(file_path):
    """Read the PDF at ``file_path`` with ``PyPDFLoader`` and return its documents."""
    return PyPDFLoader(file_path).load()
def split_documents(documents, chunk_size=2000, chunk_overlap=500):
    """Split documents into overlapping chunks using LangChain's splitter.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 2000, the value previously hard-coded here.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 500, the value previously hard-coded here.

    Returns:
        The chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def load_and_split_documents(file_path):
    """Load the PDF at ``file_path`` and split it into chunks.

    Loading is delegated to ``load_documents`` and splitting to
    ``split_documents`` so this function stays consistent with those
    helpers instead of duplicating the loader setup.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The split documents. An empty list is returned (after printing a
        diagnostic) when the loader yields nothing; when splitting yields
        nothing a diagnostic is printed and the empty result is returned.
    """
    documents = load_documents(file_path)
    if not documents:
        print("No documents loaded from file:", file_path)
        return []

    split_docs = split_documents(documents)
    if not split_docs:
        # Not fatal: report it and still hand the (empty) result back.
        print("Document splitting resulted in no output for file:", file_path)
    return split_docs
def update_metadata(documents, original_name):
    """Set ``metadata['source']`` of every document to ``original_name``.

    Each document is modified in place; other metadata keys are left
    untouched.

    Args:
        documents: Iterable of objects exposing a mutable ``metadata``
            mapping.
        original_name: Value to store under the ``'source'`` key.

    Returns:
        A new list containing the same (now updated) document objects, in
        their original order.
    """
    # Materialize first so the caller gets a fresh list even when
    # ``documents`` is a one-shot iterable.
    updated_documents = list(documents)
    for doc in updated_documents:
        doc.metadata['source'] = original_name
    return updated_documents