#####################################################
### DOCUMENT PROCESSOR [APP]
#####################################################
### Jonathan Wang
# ABOUT:
# This creates an app to chat with PDFs.
# This is the APP
# which runs the backend and renders the frontend UI.
#####################################################
### TODO Board:
# Try ColPali? https://huggingface.co/vidore/colpali
#####################################################
### PROGRAM IMPORTS
from __future__ import annotations
import base64
import gc
import logging
import os
import random
import sys
import warnings
from pathlib import Path
from typing import Any, cast
import nest_asyncio
import numpy as np
import streamlit as st
from llama_index.core import Settings, get_response_synthesizer
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.postprocessor import (
    SentenceEmbeddingOptimizer,
    SimilarityPostprocessor,
)
from llama_index.core.response_synthesizers import ResponseMode
from streamlit import session_state as ss
from summary import (
    ImageSummaryMetadataAdder,
    TableSummaryMetadataAdder,
    get_tree_summarizer,
)
from torch.cuda import (
    empty_cache,
    get_device_name,
    is_available,
    manual_seed,
    mem_get_info,
)
from transformers import set_seed
# Own Modules
from agent import doclist_to_agent
from citation import get_citation_builder
from full_doc import FullDocument
from keywords import KeywordMetadataAdder
from metadata_adder import UnstructuredPDFPostProcessor
from models import get_embedder, get_llm, get_multimodal_llm, get_reranker
from obs_logging import get_callback_manager, get_obs
from pdf_reader import UnstructuredPDFReader
from pdf_reader_utils import (
    chunk_by_header,
    clean_abbreviations,
    combine_listitem_chunks,
    dedupe_title_chunks,
    remove_header_footer_repeated,
)
from parsers import get_parser
from prompts import get_qa_prompt, get_refine_prompt
#####################################
### SETTINGS
# Logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # basicConfig already attaches a stdout handler; adding another would double every log line
# CUDA GPU memory: avoid VRAM fragmentation.
# NOTE: max_split_size_mb is an option inside PYTORCH_CUDA_ALLOC_CONF, not a standalone env var.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
os.environ["SCARF_NO_ANALYTICS"] = "true" # get rid of data collection from Unstructured
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_HOME"] = "/data/.huggingface" # save cached models on disk.
SEED = 31415926
print(f"CUDA Availablility: {is_available()}")
print(f"CUDA Device Name: {get_device_name()}")
print(f"CUDA Memory: {mem_get_info()}")
gc.collect()
empty_cache()
# Asyncio: fix some issues with nesting https://github.com/run-llama/llama_index/issues/9978
nest_asyncio.apply()
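# (Streamlit drives its own asyncio event loop and llama_index internals also call into
# asyncio; nest_asyncio lets those loops nest instead of raising "event loop is already running".)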
# Set seeds
random.seed(SEED)  # python
np.random.seed(SEED)  # numpy  # TODO(Jonathan Wang): Replace with a Generator (see sketch below)
manual_seed(SEED)  # pytorch
set_seed(SEED)  # transformers
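# A hedged sketch of the Generator-based seeding the TODO above refers to (not wired in here):
# rng = np.random.default_rng(SEED)  # local Generator replacing the legacy global state
# value = rng.random()               # consumers would receive `rng` explicitly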
# API Keys
os.environ["HF_TOKEN"] = st.secrets["huggingface_api_token"]
os.environ["OPENAI_API_KEY"] = st.secrets["openai_api_key"]
os.environ["GROQ_API_KEY"] = st.secrets["groq_api_key"]
#########################################################################
### SESSION STATE INITIALIZATION
st.set_page_config(layout="wide")
if "pdf_ref" not in ss:
ss.input_pdf = []
if "doclist" not in ss:
ss.doclist = []
if "pdf_reader" not in ss:
ss.pdf_reader = None
if "pdf_postprocessor" not in ss:
ss.pdf_postprocessor = None
# if 'sentence_model' not in ss:
# ss.sentence_model = None # sentence splitting model, as alternative to nltk/PySBD
if "embed_model" not in ss:
ss.embed_model = None
gc.collect()
empty_cache()
if "reranker_model" not in ss:
ss.reranker_model = None
gc.collect()
empty_cache()
if "llm" not in ss:
ss.llm = None
gc.collect()
empty_cache()
if "multimodal_llm" not in ss:
ss.multimodal_llm = None
gc.collect()
empty_cache()
if "callback_manager" not in ss:
ss.callback_manager = None
if "node_parser" not in ss:
ss.node_parser = None
if "node_postprocessors" not in ss:
ss.node_postprocessors = None
if "response_synthesizer" not in ss:
ss.response_synthesizer = None
if "tree_summarizer" not in ss:
ss.tree_summarizer = None
if "citation_builder" not in ss:
ss.citation_builder = None
if "agent" not in ss:
ss.agent = None
if "observability" not in ss:
ss.observability = None
if "uploaded_files" not in ss:
ss.uploaded_files = []
if "selected_file" not in ss:
ss.selected_file = None
if "chat_messages" not in ss:
ss.chat_messages = []
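# NOTE: Streamlit re-executes this entire script on every user interaction, and
# st.session_state is the only state that survives a rerun, so every heavyweight object
# above is created at most once and cached there. A hypothetical helper capturing the
# same pattern (sketch only, not used below):
# def get_or_create(key: str, factory: Callable[[], Any]) -> Any:
#     if ss.get(key) is None:
#         ss[key] = factory()
#     return ss[key]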
################################################################################
### SCRIPT
st.markdown("""
<style>
.block-container {
padding-top: 3rem;
padding-bottom: 0rem;
padding-left: 3rem;
padding-right: 3rem;
}
</style>
""", unsafe_allow_html=True)
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### UI
st.text("Autodoc Lifter Local PDF Chatbot (Built with MetaπŸ¦™3)")
col_left, col_right = st.columns([1, 1])
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### PDF Upload UI (Left Panel)
with st.sidebar:
    uploaded_files = st.file_uploader(
        label="Upload a PDF file.",
        type="pdf",
        accept_multiple_files=True,
        label_visibility="collapsed",
    )
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### PDF Display UI (Middle Panel)
# NOTE: This currently only displays the PDF, which requires user interaction (below)
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Chat UI (Right Panel)
with col_right:
    messages_container = st.container(height=475, border=False)
    input_container = st.container(height=80, border=False)
    with messages_container:
        for message in ss.chat_messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
    with input_container:
        # Accept user input
        prompt = st.chat_input("Ask your question about the document here.")
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Get Models and Settings
# Get Vision LLM
if (ss.multimodal_llm is None):
    print(f"CUDA Memory Pre-VLLM: {mem_get_info()}")
    vision_llm = get_multimodal_llm()
    ss.multimodal_llm = vision_llm
# Get LLM
if (ss.llm is None):
    print(f"CUDA Memory Pre-LLM: {mem_get_info()}")
    llm = get_llm()
    ss.llm = llm
    Settings.llm = cast(BaseLLM, llm)  # cast(type, value): the type argument comes first
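# NOTE: `Settings` is llama_index's global configuration singleton: components fall back to
# Settings.llm / Settings.embed_model / etc. whenever no model is passed to them explicitly.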
# Get Sentence Splitting Model.
# if (ss.sentence_model is None):
#     sent_splitter = get_sat_sentence_splitter("sat-3l-sm")
#     ss.sentence_model = sent_splitter
# Get Embedding Model
if (ss.embed_model is None):
    print(f"CUDA Memory Pre-Embedding: {mem_get_info()}")
    embed_model = get_embedder()
    ss.embed_model = embed_model
    Settings.embed_model = embed_model
# Get Reranker
if (ss.reranker_model is None):
    print(f"CUDA Memory Pre-Reranking: {mem_get_info()}")
    ss.reranker_model = get_reranker()
# Get Callback Manager
if (ss.callback_manager is None):
    callback_manager = get_callback_manager()
    ss.callback_manager = callback_manager
    Settings.callback_manager = callback_manager
# Get Node Parser
if (ss.node_parser is None):
    node_parser = get_parser(
        embed_model=Settings.embed_model,
        callback_manager=ss.callback_manager,
    )
    ss.node_parser = node_parser
    Settings.node_parser = node_parser
### Get PDF Reader
if (ss.pdf_reader is None):
    ss.pdf_reader = UnstructuredPDFReader()
### Get PDF Reader Postprocessing
if (ss.pdf_postprocessor is None):
    # regex_adder = RegexMetadataAdder(regex_pattern=)  # Are there any that I need?
    keyword_adder = KeywordMetadataAdder(metadata_name="keywords")
    table_summary_adder = TableSummaryMetadataAdder(llm=ss.llm)
    image_summary_adder = ImageSummaryMetadataAdder(llm=ss.multimodal_llm)
    pdf_postprocessor = UnstructuredPDFPostProcessor(
        embed_model=ss.embed_model,
        metadata_adders=[keyword_adder, table_summary_adder, image_summary_adder],
    )
    ss.pdf_postprocessor = pdf_postprocessor
#### Get Observability
if (ss.observability is None):
    ss.observability = get_obs()
observability = ss.observability
### Get Node Postprocessor Pipeline
if (ss.node_postprocessors is None):
    from nltk.tokenize import PunktTokenizer
    punkt_tokenizer = PunktTokenizer()
    ss.node_postprocessors = [
        SimilarityPostprocessor(similarity_cutoff=0.01),  # remove nodes unrelated to the query
        ss.reranker_model,  # rerank
        # remove sentences less related to the query. lower is stricter
        SentenceEmbeddingOptimizer(tokenizer_fn=punkt_tokenizer.tokenize, percentile_cutoff=0.2),
    ]
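# At query time these postprocessors run in listed order over the retrieved nodes,
# roughly equivalent to this sketch (names assumed; nothing is executed here):
# nodes = retriever.retrieve(query_str)
# for postprocessor in ss.node_postprocessors:
#     nodes = postprocessor.postprocess_nodes(nodes, query_str=query_str)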
### Get Response Synthesizer
if (ss.response_synthesizer is None):
    ss.response_synthesizer = get_response_synthesizer(
        response_mode=ResponseMode.COMPACT,
        text_qa_template=get_qa_prompt(),
        refine_template=get_refine_prompt(),
    )
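# ResponseMode.COMPACT packs as many retrieved chunks as fit into each LLM call and only
# refines across calls on overflow, versus ResponseMode.REFINE's one call per chunk.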
### Get Tree Summarizer
if (ss.tree_summarizer is None):
    ss.tree_summarizer = get_tree_summarizer()
### Get Citation Builder
if (ss.citation_builder is None):
    ss.citation_builder = get_citation_builder()
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Handle User Interaction
def handle_new_pdf(file_io: Any) -> None:
    """Handle processing a new source PDF file document."""
    with st.sidebar:
        with (st.spinner("Reading input file, this may take some time...")):
            ### Save Locally
            # TODO(Jonathan Wang): Get the user to upload their file with a reference name in a separate tab.
            if not Path(__file__).parent.joinpath("data").exists():
                print("NEWPDF: Making data directory...")
                Path(__file__).parent.joinpath("data").mkdir(parents=True)
            with open(Path(__file__).parent.joinpath("data/input.pdf"), "wb") as f:
                print("NEWPDF: Writing input file...")
                f.write(file_io.getbuffer())
            ### Create Document
            print("NEWPDF: Building Document...")
            new_document = FullDocument(
                name="input.pdf",
                file_path=Path(__file__).parent.joinpath("data/input.pdf"),
            )
            #### Process document.
            print("NEWPDF: Processing file into nodes...")
            new_document.file_to_nodes(
                reader=ss.pdf_reader,
                postreaders=[
                    clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks,
                    remove_header_footer_repeated, chunk_by_header,
                ],
                node_parser=ss.node_parser,
                postparsers=[ss.pdf_postprocessor],
            )
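            # Pipeline shape: reader parses the raw PDF into element chunks -> postreaders
            # clean them (dedupe titles, merge list items, drop repeated headers/footers,
            # regroup by header) -> node_parser splits them into retrieval nodes ->
            # postparsers attach keyword/table/image-summary metadata and embeddings.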
        ### Get Storage Context
        with (st.spinner("Processing input file, this may take some time...")):
            new_document.nodes_to_summary(summarizer=ss.tree_summarizer)
            new_document.summary_to_oneline(summarizer=ss.tree_summarizer)
            new_document.nodes_to_document_keywords()
            new_document.nodes_to_storage()
        ### Get Retrieval on Vector Store Index
        with (st.spinner("Building retriever for the input file...")):
            new_document.storage_to_retriever(callback_manager=ss.callback_manager)
        ### Get LLM Query Engine
        with (st.spinner("Building query responder for the input file...")):
            new_document.retriever_to_engine(
                response_synthesizer=ss.response_synthesizer,
                callback_manager=ss.callback_manager,
            )
            new_document.engine_to_sub_question_engine()
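            # (The sub-question engine decomposes a complex user query into per-document
            # sub-queries and synthesizes their partial answers; see full_doc.py.)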
        ### Officially Add to Document List
        ss.uploaded_files.append(file_io)  # Left UI Bar
        ss.doclist.append(new_document)  # Document list for RAG.  # TODO(Jonathan Wang): Fix potential duplication.
        ### Get LLM Agent
        with (st.spinner("Building LLM Agent for the input file...")):
            agent = doclist_to_agent(ss.doclist)
            ss.agent = agent
        # All done!
        st.toast("All done!")
    # Display summary of new document in chat.
    with messages_container:
        ss.chat_messages.append(
            {"role": "assistant", "content": new_document.summary_oneline}
        )
        with st.chat_message("assistant"):
            st.markdown(new_document.summary_oneline)
    ### Cleaning
    empty_cache()
    gc.collect()
def handle_chat_message(user_message: str) -> str:
    """Route a user chat message to the agent and return the cited response text."""
    # Get Response
    if (not ss.doclist):  # session_state always has the key; check for emptiness instead
        return "Please upload a document to get started."
    if (ss.agent is None):  # likewise, check for None rather than hasattr
        warnings.warn("No LLM Agent found. Attempting to create one.", stacklevel=2)
        with st.sidebar, (st.spinner("Building LLM Agent for the input file...")):
            agent = doclist_to_agent(ss.doclist)
            ss.agent = agent
    response = ss.agent.query(user_message)
    # Get citations if available
    response = ss.citation_builder.get_citations(response, citation_threshold=60)
    # Add citations to response text
    response_with_citations = ss.citation_builder.add_citations_to_response(response)
    return str(response_with_citations.response)
@st.cache_data
def get_pdf_display(
    file: Any,
    app_width: str = "100%",
    app_height: str = "500",
    starting_page_number: int | None = None,
) -> str:
    """Build an HTML <embed> tag that renders the PDF inline."""
    # Read file as binary and encode it as a base64 data URI.
    file_bytes = file.getbuffer()
    base64_pdf = base64.b64encode(file_bytes).decode("utf-8")
    src = f"data:application/pdf;base64,{base64_pdf}"
    if starting_page_number is not None:
        src += f"#page={starting_page_number}"  # the page fragment must sit inside the src attribute
    pdf_display = (  # TODO(Jonathan Wang): iframe vs embed
        f'<embed src="{src}" width="{app_width}" height="{app_height}" type="application/pdf"></embed>'
    )
    return (pdf_display)
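# NOTE: embedding the PDF as a base64 data URI keeps the app self-contained, but the HTML
# payload grows to roughly 4/3 of the file size, so very large PDFs may render slowly or
# hit browser data-URI limits; serving the file and pointing an iframe at it would avoid that.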
# Upload
with st.sidebar:
    uploaded_files = uploaded_files or []  # handle case when no file is uploaded
    for uploaded_file in uploaded_files:
        if (uploaded_file not in ss.uploaded_files):
            handle_new_pdf(uploaded_file)
    if (ss.selected_file is None and ss.uploaded_files):
        ss.selected_file = ss.uploaded_files[-1]
    file_names = [file.name for file in ss.uploaded_files]
    selected_file_name = st.radio("Uploaded Files:", file_names)
    if selected_file_name:
        ss.selected_file = [file for file in ss.uploaded_files if file.name == selected_file_name][-1]
with col_left:
    if (ss.selected_file is None):
        selected_file_name = "Upload a file."
        st.markdown(f"## {selected_file_name}")
    else:
        selected_file = ss.selected_file
        selected_file_name = selected_file.name
        if (selected_file.type == "application/pdf"):
            pdf_display = get_pdf_display(selected_file, app_width="100%", app_height="550")
            st.markdown(pdf_display, unsafe_allow_html=True)
# Chat
if prompt:
    with messages_container:
        with st.chat_message("user"):
            st.markdown(prompt)
        ss.chat_messages.append({"role": "user", "content": prompt})
        with st.spinner("Generating response..."):
            # Get Response
            response = handle_chat_message(prompt)
            if response:
                ss.chat_messages.append(
                    {"role": "assistant", "content": response}
                )
                with st.chat_message("assistant"):
                    st.markdown(response)