#####################################################
### DOCUMENT PROCESSOR [APP]
#####################################################
### Jonathan Wang
# ABOUT:
# This creates an app to chat with PDFs.
# This is the APP
# which runs the backend and renders the frontend UI.
#####################################################
### TODO Board:
# Try ColPali? https://huggingface.co/vidore/colpali
#####################################################
### PROGRAM IMPORTS
from __future__ import annotations
import base64
import gc
import logging
import os
import random
import sys
import warnings
from pathlib import Path
from typing import Any, cast
import nest_asyncio
import numpy as np
import streamlit as st
from llama_index.core import Settings, get_response_synthesizer
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.postprocessor import (
    SentenceEmbeddingOptimizer,
    SimilarityPostprocessor,
)
from llama_index.core.response_synthesizers import ResponseMode
from streamlit import session_state as ss
from summary import (
    ImageSummaryMetadataAdder,
    TableSummaryMetadataAdder,
    get_tree_summarizer,
)
from torch.cuda import (
    empty_cache,
    get_device_name,
    is_available,
    manual_seed,
    mem_get_info,
)
from transformers import set_seed
# Own Modules
from agent import doclist_to_agent
from citation import get_citation_builder
from full_doc import FullDocument
from keywords import KeywordMetadataAdder
from metadata_adder import UnstructuredPDFPostProcessor
from models import get_embedder, get_llm, get_multimodal_llm, get_reranker
from obs_logging import get_callback_manager, get_obs
from pdf_reader import UnstructuredPDFReader
from pdf_reader_utils import (
    chunk_by_header,
    clean_abbreviations,
    combine_listitem_chunks,
    dedupe_title_chunks,
    remove_header_footer_repeated,
)
from parsers import get_parser
from prompts import get_qa_prompt, get_refine_prompt
#####################################
### SETTINGS
# Logging
logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # basicConfig already attaches a stdout handler; adding another would double every log line
# CUDA GPU memory: avoid VRAM fragmentation.
# NOTE: max_split_size_mb is an option inside PYTORCH_CUDA_ALLOC_CONF, not a standalone env var.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
os.environ["SCARF_NO_ANALYTICS"] = "true" # get rid of data collection from Unstructured
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_HOME"] = "/data/.huggingface" # save cached models on disk.
SEED = 31415926
print(f"CUDA Availablility: {is_available()}")
print(f"CUDA Device Name: {get_device_name()}")
print(f"CUDA Memory: {mem_get_info()}")
gc.collect()
empty_cache()
# Asyncio: fix some issues with nesting https://github.com/run-llama/llama_index/issues/9978
nest_asyncio.apply()
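# (Streamlit drives its own asyncio event loop and llama_index internals also call into
# asyncio; nest_asyncio lets those loops nest instead of raising "event loop is already running".)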
# Set seeds
random.seed(SEED)  # python
np.random.seed(SEED)  # numpy  # TODO(Jonathan Wang): Replace with a Generator (see sketch below)
manual_seed(SEED)  # pytorch
set_seed(SEED)  # transformers
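# A hedged sketch of the Generator-based seeding the TODO above refers to (not wired in here):
# rng = np.random.default_rng(SEED)  # local Generator replacing the legacy global state
# value = rng.random()               # consumers would receive `rng` explicitly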
# API Keys
os.environ["HF_TOKEN"] = st.secrets["huggingface_api_token"]
os.environ["OPENAI_API_KEY"] = st.secrets["openai_api_key"]
os.environ["GROQ_API_KEY"] = st.secrets["groq_api_key"]
#########################################################################
### SESSION STATE INITIALIZATION
st.set_page_config(layout="wide")
if "pdf_ref" not in ss:
ss.input_pdf = []
if "doclist" not in ss:
ss.doclist = []
if "pdf_reader" not in ss:
ss.pdf_reader = None
if "pdf_postprocessor" not in ss:
ss.pdf_postprocessor = None
# if 'sentence_model' not in ss:
# ss.sentence_model = None # sentence splitting model, as alternative to nltk/PySBD
if "embed_model" not in ss:
ss.embed_model = None
gc.collect()
empty_cache()
if "reranker_model" not in ss:
ss.reranker_model = None
gc.collect()
empty_cache()
if "llm" not in ss:
ss.llm = None
gc.collect()
empty_cache()
if "multimodal_llm" not in ss:
ss.multimodal_llm = None
gc.collect()
empty_cache()
if "callback_manager" not in ss:
ss.callback_manager = None
if "node_parser" not in ss:
ss.node_parser = None
if "node_postprocessors" not in ss:
ss.node_postprocessors = None
if "response_synthesizer" not in ss:
ss.response_synthesizer = None
if "tree_summarizer" not in ss:
ss.tree_summarizer = None
if "citation_builder" not in ss:
ss.citation_builder = None
if "agent" not in ss:
ss.agent = None
if "observability" not in ss:
ss.observability = None
if "uploaded_files" not in ss:
ss.uploaded_files = []
if "selected_file" not in ss:
ss.selected_file = None
if "chat_messages" not in ss:
ss.chat_messages = []
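# NOTE: Streamlit re-executes this entire script on every user interaction, and
# st.session_state is the only state that survives a rerun, so every heavyweight object
# above is created at most once and cached there. A hypothetical helper capturing the
# same pattern (sketch only, not used below):
# def get_or_create(key: str, factory: Callable[[], Any]) -> Any:
#     if ss.get(key) is None:
#         ss[key] = factory()
#     return ss[key]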
################################################################################
### SCRIPT
st.markdown("""
<style>
.block-container {
padding-top: 3rem;
padding-bottom: 0rem;
padding-left: 3rem;
padding-right: 3rem;
}
</style>
""", unsafe_allow_html=True)
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### UI
st.text("Autodoc Lifter Local PDF Chatbot (Built with MetaπŸ¦™3)")
col_left, col_right = st.columns([1, 1])
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### PDF Upload UI (Left Panel)
with st.sidebar:
    uploaded_files = st.file_uploader(
        label="Upload a PDF file.",
        type="pdf",
        accept_multiple_files=True,
        label_visibility="collapsed",
    )
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### PDF Display UI (Middle Panel)
# NOTE: This currently only displays the PDF, which requires user interaction (below)
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Chat UI (Right Panel)
with col_right:
    messages_container = st.container(height=475, border=False)
    input_container = st.container(height=80, border=False)
    with messages_container:
        for message in ss.chat_messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
    with input_container:
        # Accept user input
        prompt = st.chat_input("Ask your question about the document here.")
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Get Models and Settings
# Get Vision LLM
if (ss.multimodal_llm is None):
    print(f"CUDA Memory Pre-VLLM: {mem_get_info()}")
    vision_llm = get_multimodal_llm()
    ss.multimodal_llm = vision_llm
# Get LLM
if (ss.llm is None):
    print(f"CUDA Memory Pre-LLM: {mem_get_info()}")
    llm = get_llm()
    ss.llm = llm
    Settings.llm = cast(BaseLLM, llm)  # cast(type, value): the type argument comes first
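# NOTE: `Settings` is llama_index's global configuration singleton: components fall back to
# Settings.llm / Settings.embed_model / etc. whenever no model is passed to them explicitly.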
# Get Sentence Splitting Model.
# if (ss.sentence_model is None):
#     sent_splitter = get_sat_sentence_splitter("sat-3l-sm")
#     ss.sentence_model = sent_splitter
# Get Embedding Model
if (ss.embed_model is None):
    print(f"CUDA Memory Pre-Embedding: {mem_get_info()}")
    embed_model = get_embedder()
    ss.embed_model = embed_model
    Settings.embed_model = embed_model
# Get Reranker
if (ss.reranker_model is None):
    print(f"CUDA Memory Pre-Reranking: {mem_get_info()}")
    ss.reranker_model = get_reranker()
# Get Callback Manager
if (ss.callback_manager is None):
    callback_manager = get_callback_manager()
    ss.callback_manager = callback_manager
    Settings.callback_manager = callback_manager
# Get Node Parser
if (ss.node_parser is None):
    node_parser = get_parser(
        embed_model=Settings.embed_model,
        callback_manager=ss.callback_manager,
    )
    ss.node_parser = node_parser
    Settings.node_parser = node_parser
### Get PDF Reader
if (ss.pdf_reader is None):
    ss.pdf_reader = UnstructuredPDFReader()
### Get PDF Reader Postprocessing
if (ss.pdf_postprocessor is None):
    # regex_adder = RegexMetadataAdder(regex_pattern=)  # Are there any that I need?
    keyword_adder = KeywordMetadataAdder(metadata_name="keywords")
    table_summary_adder = TableSummaryMetadataAdder(llm=ss.llm)
    image_summary_adder = ImageSummaryMetadataAdder(llm=ss.multimodal_llm)
    pdf_postprocessor = UnstructuredPDFPostProcessor(
        embed_model=ss.embed_model,
        metadata_adders=[keyword_adder, table_summary_adder, image_summary_adder],
    )
    ss.pdf_postprocessor = pdf_postprocessor
#### Get Observability
if (ss.observability is None):
    ss.observability = get_obs()
observability = ss.observability
### Get Node Postprocessor Pipeline
if (ss.node_postprocessors is None):
    from nltk.tokenize import PunktTokenizer
    punkt_tokenizer = PunktTokenizer()
    ss.node_postprocessors = [
        SimilarityPostprocessor(similarity_cutoff=0.01),  # remove nodes unrelated to the query
        ss.reranker_model,  # rerank
        # remove sentences less related to the query. lower is stricter
        SentenceEmbeddingOptimizer(tokenizer_fn=punkt_tokenizer.tokenize, percentile_cutoff=0.2),
    ]
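# At query time these postprocessors run in listed order over the retrieved nodes,
# roughly equivalent to this sketch (names assumed; nothing is executed here):
# nodes = retriever.retrieve(query_str)
# for postprocessor in ss.node_postprocessors:
#     nodes = postprocessor.postprocess_nodes(nodes, query_str=query_str)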
### Get Response Synthesizer
if (ss.response_synthesizer is None):
    ss.response_synthesizer = get_response_synthesizer(
        response_mode=ResponseMode.COMPACT,
        text_qa_template=get_qa_prompt(),
        refine_template=get_refine_prompt(),
    )
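# ResponseMode.COMPACT packs as many retrieved chunks as fit into each LLM call and only
# refines across calls on overflow, versus ResponseMode.REFINE's one call per chunk.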
### Get Tree Summarizer
if (ss.tree_summarizer is None):
    ss.tree_summarizer = get_tree_summarizer()
### Get Citation Builder
if (ss.citation_builder is None):
    ss.citation_builder = get_citation_builder()
### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
### Handle User Interaction
def handle_new_pdf(file_io: Any) -> None:
    """Handle processing a new source PDF file document."""
    with st.sidebar:
        with (st.spinner("Reading input file, this may take some time...")):
            ### Save Locally
            # TODO(Jonathan Wang): Get the user to upload their file with a reference name in a separate tab.
            if not Path(__file__).parent.joinpath("data").exists():
                print("NEWPDF: Making data directory...")
                Path(__file__).parent.joinpath("data").mkdir(parents=True)
            with open(Path(__file__).parent.joinpath("data/input.pdf"), "wb") as f:
                print("NEWPDF: Writing input file...")
                f.write(file_io.getbuffer())
            ### Create Document
            print("NEWPDF: Building Document...")
            new_document = FullDocument(
                name="input.pdf",
                file_path=Path(__file__).parent.joinpath("data/input.pdf"),
            )
            #### Process document.
            print("NEWPDF: Processing file into nodes...")
            new_document.file_to_nodes(
                reader=ss.pdf_reader,
                postreaders=[
                    clean_abbreviations, dedupe_title_chunks, combine_listitem_chunks,
                    remove_header_footer_repeated, chunk_by_header,
                ],
                node_parser=ss.node_parser,
                postparsers=[ss.pdf_postprocessor],
            )
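            # Pipeline shape: reader parses the raw PDF into element chunks -> postreaders
            # clean them (dedupe titles, merge list items, drop repeated headers/footers,
            # regroup by header) -> node_parser splits them into retrieval nodes ->
            # postparsers attach keyword/table/image-summary metadata and embeddings.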
        ### Get Storage Context
        with (st.spinner("Processing input file, this may take some time...")):
            new_document.nodes_to_summary(summarizer=ss.tree_summarizer)
            new_document.summary_to_oneline(summarizer=ss.tree_summarizer)
            new_document.nodes_to_document_keywords()
            new_document.nodes_to_storage()
        ### Get Retrieval on Vector Store Index
        with (st.spinner("Building retriever for the input file...")):
            new_document.storage_to_retriever(callback_manager=ss.callback_manager)
        ### Get LLM Query Engine
        with (st.spinner("Building query responder for the input file...")):
            new_document.retriever_to_engine(
                response_synthesizer=ss.response_synthesizer,
                callback_manager=ss.callback_manager,
            )
            new_document.engine_to_sub_question_engine()
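            # (The sub-question engine decomposes a complex user query into per-document
            # sub-queries and synthesizes their partial answers; see full_doc.py.)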
        ### Officially Add to Document List
        ss.uploaded_files.append(file_io)  # Left UI Bar
        ss.doclist.append(new_document)  # Document list for RAG.  # TODO(Jonathan Wang): Fix potential duplication.
        ### Get LLM Agent
        with (st.spinner("Building LLM Agent for the input file...")):
            agent = doclist_to_agent(ss.doclist)
            ss.agent = agent
        # All done!
        st.toast("All done!")
    # Display summary of new document in chat.
    with messages_container:
        ss.chat_messages.append(
            {"role": "assistant", "content": new_document.summary_oneline}
        )
        with st.chat_message("assistant"):
            st.markdown(new_document.summary_oneline)
    ### Cleaning
    empty_cache()
    gc.collect()
def handle_chat_message(user_message: str) -> str:
    """Route a user chat message to the agent and return the cited response text."""
    # Get Response
    if (not ss.doclist):  # session_state always has the key; check for emptiness instead
        return "Please upload a document to get started."
    if (ss.agent is None):  # likewise, check for None rather than hasattr
        warnings.warn("No LLM Agent found. Attempting to create one.", stacklevel=2)
        with st.sidebar, (st.spinner("Building LLM Agent for the input file...")):
            agent = doclist_to_agent(ss.doclist)
            ss.agent = agent
    response = ss.agent.query(user_message)
    # Get citations if available
    response = ss.citation_builder.get_citations(response, citation_threshold=60)
    # Add citations to response text
    response_with_citations = ss.citation_builder.add_citations_to_response(response)
    return str(response_with_citations.response)
@st.cache_data
def get_pdf_display(
    file: Any,
    app_width: str = "100%",
    app_height: str = "500",
    starting_page_number: int | None = None,
) -> str:
    """Build an HTML <embed> tag that renders the PDF inline."""
    # Read file as binary and encode it as a base64 data URI.
    file_bytes = file.getbuffer()
    base64_pdf = base64.b64encode(file_bytes).decode("utf-8")
    src = f"data:application/pdf;base64,{base64_pdf}"
    if starting_page_number is not None:
        src += f"#page={starting_page_number}"  # the page fragment must sit inside the src attribute
    pdf_display = (  # TODO(Jonathan Wang): iframe vs embed
        f'<embed src="{src}" width="{app_width}" height="{app_height}" type="application/pdf"></embed>'
    )
    return (pdf_display)
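# NOTE: embedding the PDF as a base64 data URI keeps the app self-contained, but the HTML
# payload grows to roughly 4/3 of the file size, so very large PDFs may render slowly or
# hit browser data-URI limits; serving the file and pointing an iframe at it would avoid that.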
# Upload
with st.sidebar:
    uploaded_files = uploaded_files or []  # handle case when no file is uploaded
    for uploaded_file in uploaded_files:
        if (uploaded_file not in ss.uploaded_files):
            handle_new_pdf(uploaded_file)
    if (ss.selected_file is None and ss.uploaded_files):
        ss.selected_file = ss.uploaded_files[-1]
    file_names = [file.name for file in ss.uploaded_files]
    selected_file_name = st.radio("Uploaded Files:", file_names)
    if selected_file_name:
        ss.selected_file = [file for file in ss.uploaded_files if file.name == selected_file_name][-1]
with col_left:
    if (ss.selected_file is None):
        selected_file_name = "Upload a file."
        st.markdown(f"## {selected_file_name}")
    else:
        selected_file = ss.selected_file
        selected_file_name = selected_file.name
        if (selected_file.type == "application/pdf"):
            pdf_display = get_pdf_display(selected_file, app_width="100%", app_height="550")
            st.markdown(pdf_display, unsafe_allow_html=True)
# Chat
if prompt:
    with messages_container:
        with st.chat_message("user"):
            st.markdown(prompt)
        ss.chat_messages.append({"role": "user", "content": prompt})
        with st.spinner("Generating response..."):
            # Get Response
            response = handle_chat_message(prompt)
            if response:
                ss.chat_messages.append(
                    {"role": "assistant", "content": response}
                )
                with st.chat_message("assistant"):
                    st.markdown(response)