#####################################################
### DOCUMENT PROCESSOR [STORAGE]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the setup for the Storage in the RAG pipeline.
#####################################################
## TODOS:
# Handle creating multiple vector stores, one for each document that has been processed (?); see the sketch after get_vector_store below.
#####################################################
## IMPORTS:
import gc
from torch.cuda import empty_cache
from typing import Optional, IO, List, Tuple
import streamlit as st
import qdrant_client
from llama_index.core import StorageContext
from llama_index.core.storage.docstore.types import BaseDocumentStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.node_parser import NodeParser
# Reader and processing
from pdf_reader import UnstructuredPDFReader
from pdf_reader_utils import clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks, remove_header_footer_repeated, chunk_by_header
from metadata_adder import UnstructuredPDFPostProcessor
#####################################################
# Get Vector Store
@st.cache_resource
def get_vector_store() -> QdrantVectorStore:
    """Create (and cache) the in-memory Qdrant vector store used for PDF chunks."""
    qdr_client = qdrant_client.QdrantClient(
        location=":memory:"
    )
    qdr_aclient = qdrant_client.AsyncQdrantClient(
        location=":memory:"
    )
    return QdrantVectorStore(client=qdr_client, aclient=qdr_aclient, collection_name='pdf', prefer_grpc=True)
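
# Hedged sketch for the TODO above: one vector store per processed document.
# The function name, the doc_id parameter, and the per-document collection naming
# are assumptions (nothing else in the app calls this yet); it only shows that the
# cached-client pattern above extends to one in-memory collection per document.
@st.cache_resource
def get_vector_store_for_document(doc_id: str) -> QdrantVectorStore:
    """Sketch: a separate in-memory Qdrant collection keyed by a caller-supplied doc_id."""
    qdr_client = qdrant_client.QdrantClient(
        location=":memory:"
    )
    qdr_aclient = qdrant_client.AsyncQdrantClient(
        location=":memory:"
    )
    return QdrantVectorStore(client=qdr_client, aclient=qdr_aclient, collection_name=f"pdf_{doc_id}", prefer_grpc=True)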
# Get Document Store from List of Documents
# @st.cache_resource # can't hash a list.
def get_docstore(documents: List) -> BaseDocumentStore:
"""Get the document store from a list of documents."""
docstore = SimpleDocumentStore()
docstore.add_documents(documents)
return docstore
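# Illustrative usage (assumes an UnstructuredPDFReader instance and a local file,
# neither of which is constructed in this module):
#   pdf_chunks = pdf_reader.load_data(pdf_file_path="example.pdf", pdf_file=None)
#   docstore = get_docstore(pdf_chunks)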
# Get storage context and vector store index from a PDF.
# @st.cache_resource # can't cache the pdf_reader or vector_store
# def pdf_to_storage(
#     pdf_file_path: Optional[str],
#     pdf_file: Optional[IO[bytes]],
#     _pdf_reader: UnstructuredPDFReader,
#     _embed_model: BaseEmbedding,
#     _node_parser: Optional[NodeParser] = None,
#     _pdf_postprocessor: Optional[UnstructuredPDFPostProcessor] = None,
#     _vector_store: Optional[QdrantVectorStore] = None,
# ) -> Tuple[StorageContext, VectorStoreIndex]:
#     """Read in PDF and save to storage."""
#     # Read the PDF with the PDF reader
#     pdf_chunks = _pdf_reader.load_data(pdf_file_path=pdf_file_path, pdf_file=pdf_file)
#     # Clean the PDF chunks
#     # Insert any cleaners here.
#     # TODO: Cleaners to remove repeated header/footer text, overlapping elements, ...
#     pdf_chunks = clean_abbreviations(pdf_chunks)
#     pdf_chunks = dedupe_title_chunks(pdf_chunks)
#     pdf_chunks = combine_listitem_chunks(pdf_chunks)
#     pdf_chunks = remove_header_footer_repeated(pdf_chunks)
#     empty_cache()
#     gc.collect()
#     # Postprocess the PDF nodes.
#     if (_node_parser is None):
#         _node_parser = Settings.node_parser
#     # Combine by semantic headers
#     pdf_chunks = chunk_by_header(pdf_chunks, 1000)
#     pdf_chunks = _node_parser.get_nodes_from_documents(pdf_chunks)
#     if (_pdf_postprocessor is not None):
#         pdf_chunks = _pdf_postprocessor(pdf_chunks)
#     # Add embeddings
#     pdf_chunks = _embed_model(pdf_chunks)
#     # Create Document Store
#     docstore = get_docstore(documents=pdf_chunks)
#     # Create Vector Store if not provided
#     if (_vector_store is None):
#         _vector_store = get_vector_store()
#     ## TODO: Handle images in StorageContext.
#     # Save into Storage
#     storage_context = StorageContext.from_defaults(
#         docstore=docstore,
#         vector_store=_vector_store
#     )
#     vector_store_index = VectorStoreIndex(
#         pdf_chunks, storage_context=storage_context
#     )
#     return (storage_context, vector_store_index)
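
# Minimal live sketch of the storage step from the drafted pipeline above, assuming
# the nodes have already been parsed, cleaned, and embedded elsewhere. The name
# `nodes_to_storage` is illustrative and is not referenced anywhere else in the app.
def nodes_to_storage(
    nodes: List,
    vector_store: Optional[QdrantVectorStore] = None,
) -> Tuple[StorageContext, VectorStoreIndex]:
    """Bundle pre-embedded nodes into a docstore + vector store and build an index over them."""
    docstore = get_docstore(nodes)
    if vector_store is None:
        vector_store = get_vector_store()
    storage_context = StorageContext.from_defaults(
        docstore=docstore,
        vector_store=vector_store
    )
    vector_store_index = VectorStoreIndex(
        nodes, storage_context=storage_context
    )
    return (storage_context, vector_store_index)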