#####################################################
### DOCUMENT PROCESSOR [STORAGE]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the setup for the Storage in the RAG pipeline.
#####################################################
## TODOS:
# Handle creating multiple vector stores, one for each document that has been processed (?); see the sketch after get_vector_store below.
#####################################################
## IMPORTS:
import gc
from torch.cuda import empty_cache
from typing import Optional, IO, List, Tuple
import streamlit as st
import qdrant_client
from llama_index.core import StorageContext
from llama_index.core.storage.docstore.types import BaseDocumentStore
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.node_parser import NodeParser
# Reader and processing
from pdf_reader import UnstructuredPDFReader
from pdf_reader_utils import clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks, remove_header_footer_repeated, chunk_by_header
from metadata_adder import UnstructuredPDFPostProcessor
#####################################################
# Get Vector Store
@st.cache_resource
def get_vector_store() -> QdrantVectorStore:
    """Create (and cache) the in-memory Qdrant vector store used for PDF chunks."""
    qdr_client = qdrant_client.QdrantClient(
        location=":memory:"
    )
    qdr_aclient = qdrant_client.AsyncQdrantClient(
        location=":memory:"
    )
    return QdrantVectorStore(client=qdr_client, aclient=qdr_aclient, collection_name='pdf', prefer_grpc=True)
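
# Hedged sketch for the TODO above: one vector store per processed document.
# The function name, the doc_id parameter, and the per-document collection naming
# are assumptions (nothing else in the app calls this yet); it only shows that the
# cached-client pattern above extends to one in-memory collection per document.
@st.cache_resource
def get_vector_store_for_document(doc_id: str) -> QdrantVectorStore:
    """Sketch: a separate in-memory Qdrant collection keyed by a caller-supplied doc_id."""
    qdr_client = qdrant_client.QdrantClient(
        location=":memory:"
    )
    qdr_aclient = qdrant_client.AsyncQdrantClient(
        location=":memory:"
    )
    return QdrantVectorStore(client=qdr_client, aclient=qdr_aclient, collection_name=f"pdf_{doc_id}", prefer_grpc=True)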
# Get Document Store from List of Documents
# @st.cache_resource # can't hash a list.
def get_docstore(documents: List) -> BaseDocumentStore:
"""Get the document store from a list of documents."""
docstore = SimpleDocumentStore()
docstore.add_documents(documents)
return docstore
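# Illustrative usage (assumes an UnstructuredPDFReader instance and a local file,
# neither of which is constructed in this module):
#   pdf_chunks = pdf_reader.load_data(pdf_file_path="example.pdf", pdf_file=None)
#   docstore = get_docstore(pdf_chunks)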
# Get storage context and vector store index from a PDF.
# @st.cache_resource # can't cache the pdf_reader or vector_store
# def pdf_to_storage(
#     pdf_file_path: Optional[str],
#     pdf_file: Optional[IO[bytes]],
#     _pdf_reader: UnstructuredPDFReader,
#     _embed_model: BaseEmbedding,
#     _node_parser: Optional[NodeParser] = None,
#     _pdf_postprocessor: Optional[UnstructuredPDFPostProcessor] = None,
#     _vector_store: Optional[QdrantVectorStore] = None,
# ) -> Tuple[StorageContext, VectorStoreIndex]:
#     """Read in PDF and save to storage."""
#     # Read the PDF with the PDF reader
#     pdf_chunks = _pdf_reader.load_data(pdf_file_path=pdf_file_path, pdf_file=pdf_file)
#     # Clean the PDF chunks
#     # Insert any cleaners here.
#     # TODO: Cleaners to remove repeated header/footer text, overlapping elements, ...
#     pdf_chunks = clean_abbreviations(pdf_chunks)
#     pdf_chunks = dedupe_title_chunks(pdf_chunks)
#     pdf_chunks = combine_listitem_chunks(pdf_chunks)
#     pdf_chunks = remove_header_footer_repeated(pdf_chunks)
#     empty_cache()
#     gc.collect()
#     # Postprocess the PDF nodes.
#     if (_node_parser is None):
#         _node_parser = Settings.node_parser
#     # Combine by semantic headers
#     pdf_chunks = chunk_by_header(pdf_chunks, 1000)
#     pdf_chunks = _node_parser.get_nodes_from_documents(pdf_chunks)
#     if (_pdf_postprocessor is not None):
#         pdf_chunks = _pdf_postprocessor(pdf_chunks)
#     # Add embeddings
#     pdf_chunks = _embed_model(pdf_chunks)
#     # Create Document Store
#     docstore = get_docstore(documents=pdf_chunks)
#     # Create Vector Store if not provided
#     if (_vector_store is None):
#         _vector_store = get_vector_store()
#     ## TODO: Handle images in StorageContext.
#     # Save into Storage
#     storage_context = StorageContext.from_defaults(
#         docstore=docstore,
#         vector_store=_vector_store
#     )
#     vector_store_index = VectorStoreIndex(
#         pdf_chunks, storage_context=storage_context
#     )
#     return (storage_context, vector_store_index)
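
# Minimal live sketch of the storage step from the drafted pipeline above, assuming
# the nodes have already been parsed, cleaned, and embedded elsewhere. The name
# `nodes_to_storage` is illustrative and is not referenced anywhere else in the app.
def nodes_to_storage(
    nodes: List,
    vector_store: Optional[QdrantVectorStore] = None,
) -> Tuple[StorageContext, VectorStoreIndex]:
    """Bundle pre-embedded nodes into a docstore + vector store and build an index over them."""
    docstore = get_docstore(nodes)
    if vector_store is None:
        vector_store = get_vector_store()
    storage_context = StorageContext.from_defaults(
        docstore=docstore,
        vector_store=vector_store
    )
    vector_store_index = VectorStoreIndex(
        nodes, storage_context=storage_context
    )
    return (storage_context, vector_store_index)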