Spaces:
Paused
Paused
##################################################### | |
### DOCUMENT PROCESSOR [STORAGE] | |
##################################################### | |
# Jonathan Wang | |
# ABOUT: | |
# This project creates an app to chat with PDFs. | |
# This is the setup for the Storage in the RAG pipeline. | |
##################################################### | |
## TODOS: | |
# Handle creating multiple vector stores, one for each document which has been processed (?) | |
##################################################### | |
## IMPORTS: | |
import gc | |
from torch.cuda import empty_cache | |
from typing import Optional, IO, List, Tuple | |
import streamlit as st | |
import qdrant_client | |
from llama_index.core import StorageContext | |
from llama_index.core.storage.docstore.types import BaseDocumentStore | |
from llama_index.core.storage.docstore import SimpleDocumentStore | |
from llama_index.vector_stores.qdrant import QdrantVectorStore | |
from llama_index.core import VectorStoreIndex | |
from llama_index.core.settings import Settings | |
from llama_index.core.base.embeddings.base import BaseEmbedding | |
from llama_index.core.node_parser import NodeParser | |
# Reader and processing | |
from pdf_reader import UnstructuredPDFReader | |
from pdf_reader_utils import clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks, remove_header_footer_repeated, chunk_by_header | |
from metadata_adder import UnstructuredPDFPostProcessor | |
##################################################### | |
# Get Vector Store | |
def get_vector_store(collection_name: str = 'pdf') -> QdrantVectorStore:
    """Create a Qdrant-backed vector store using in-memory clients.

    Args:
        collection_name: Name of the Qdrant collection the store uses.
            Defaults to 'pdf', which is the name this function always
            used before the parameter existed.

    Returns:
        A QdrantVectorStore built from a newly created sync client and a
        newly created async client.
    """
    # Both clients are constructed with location=":memory:".
    # NOTE(review): the sync and async clients are two separate objects;
    # presumably they do not share in-memory data -- confirm before relying
    # on the async query path.
    qdr_client = qdrant_client.QdrantClient(location=":memory:")
    qdr_aclient = qdrant_client.AsyncQdrantClient(location=":memory:")

    # NOTE(review): prefer_grpc is kept as in the original call; it may have
    # no effect when explicit clients are supplied -- TODO confirm.
    return QdrantVectorStore(
        client=qdr_client,
        aclient=qdr_aclient,
        collection_name=collection_name,
        prefer_grpc=True,
    )
# Get Document Store from List of Documents | |
# @st.cache_resource # can't hash a list. | |
def get_docstore(documents: List) -> BaseDocumentStore:
    """Build a document store populated with the given documents.

    Args:
        documents: The items to add to the store.

    Returns:
        A new SimpleDocumentStore to which `documents` have been added.
    """
    store = SimpleDocumentStore()
    store.add_documents(documents)
    return store
# Get storage context and vector store index from a PDF
# @st.cache_resource # can't cache the pdf_reader or vector_store | |
# def pdf_to_storage( | |
# pdf_file_path: Optional[str], | |
# pdf_file: Optional[IO[bytes]], | |
# _pdf_reader: UnstructuredPDFReader, | |
# _embed_model: BaseEmbedding, | |
# _node_parser: Optional[NodeParser] = None, | |
# _pdf_postprocessor: Optional[UnstructuredPDFPostProcessor] = None, | |
# _vector_store: Optional[QdrantVectorStore]=None, | |
# ) -> Tuple[StorageContext, VectorStoreIndex]: | |
# """Read in PDF and save to storage.""" | |
# # Read the PDF with the PDF reader | |
# pdf_chunks = _pdf_reader.load_data(pdf_file_path=pdf_file_path, pdf_file=pdf_file) | |
# # Clean the PDF chunks | |
# # Insert any cleaners here. | |
# # TODO: Cleaners to remove repeated header/footer text, overlapping elements, ... | |
# pdf_chunks = clean_abbreviations(pdf_chunks) | |
# pdf_chunks = dedupe_title_chunks(pdf_chunks) | |
# pdf_chunks = combine_listitem_chunks(pdf_chunks) | |
# pdf_chunks = remove_header_footer_repeated(pdf_chunks) | |
# empty_cache() | |
# gc.collect() | |
# # Postprocess the PDF nodes. | |
# if (_node_parser is None): | |
# _node_parser = Settings.node_parser | |
# # Combine by semantic headers | |
# pdf_chunks = chunk_by_header(pdf_chunks, 1000) | |
# pdf_chunks = _node_parser.get_nodes_from_documents(pdf_chunks) | |
# if (_pdf_postprocessor is not None): | |
# pdf_chunks = _pdf_postprocessor(pdf_chunks) | |
# # Add embeddings | |
# pdf_chunks = _embed_model(pdf_chunks) | |
# # Create Document Store | |
# docstore = get_docstore(documents=pdf_chunks) | |
# # Create Vector Store if not provided | |
# if (_vector_store is None): | |
# _vector_store = get_vector_store() | |
# ## TODO: Handle images in StorageContext. | |
# # Save into Storage | |
# storage_context = StorageContext.from_defaults( | |
# docstore=docstore, | |
# vector_store=_vector_store | |
# ) | |
# vector_store_index = VectorStoreIndex( | |
# pdf_chunks, storage_context=storage_context | |
# ) | |
# return (storage_context, vector_store_index) |