Spaces:
Runtime error
Runtime error
from langchain_community.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import UnstructuredAPIFileLoader | |
def load_documents_OCR(file_path, unstructured_api):
    """Load a file that needs OCR through the Unstructured API loader.

    Args:
        file_path: Path of the file, forwarded to the loader as ``file_path``.
        unstructured_api: API key, forwarded to the loader as ``api_key``.

    Returns:
        Whatever ``UnstructuredAPIFileLoader.load()`` returns for the file.
    """
    return UnstructuredAPIFileLoader(
        file_path=file_path,
        api_key=unstructured_api,
    ).load()
def load_documents(file_path):
    """Load a PDF with LangChain's ``PyPDFLoader``.

    Args:
        file_path: Path of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load()`` returns for the file.
    """
    return PyPDFLoader(file_path).load()
def split_documents(documents, chunk_size=2000, chunk_overlap=500):
    """Split documents into overlapping chunks with a LangChain splitter.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size handed to the splitter. Defaults to 2000,
            the value this function previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 500, the previously hard-coded value.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)
def load_and_split_documents(file_path):
    """Load a PDF from ``file_path`` and split it into chunks.

    Prints a diagnostic to stdout when either the load or the split
    produces nothing.

    Args:
        file_path: Path of the PDF, passed to ``PyPDFLoader``.

    Returns:
        The output of ``split_documents`` for the loaded pages, or an
        empty list when the loader returned nothing.
    """
    pages = PyPDFLoader(file_path).load()
    if pages:
        chunks = split_documents(pages)
        if not chunks:
            # Loading worked but the splitter produced no chunks.
            print("Document splitting resulted in no output for file:", file_path)
        return chunks

    # Nothing was loaded, so there is nothing to split.
    print("No documents loaded from file:", file_path)
    return []
def update_metadata(documents, original_name):
    """Set the ``source`` metadata entry of every document.

    Each document's ``metadata`` mapping is modified in place; the same
    document objects are returned in a newly built list.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping.
        original_name: Value stored under the ``'source'`` key.

    Returns:
        A new list holding the (mutated) input documents in order.
    """
    def _stamp(document):
        # Overwrite any existing source with the caller-supplied name.
        document.metadata['source'] = original_name
        return document

    return [_stamp(document) for document in documents]