document-qa / streamlit_app.py
lfoppiano's picture
add more embedding functions
8520312
raw
history blame
19.6 kB
import os
import re
from hashlib import blake2b
from tempfile import NamedTemporaryFile
import dotenv
from grobid_quantities.quantities import QuantitiesAPI
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.chat_models.openai import ChatOpenAI
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
from streamlit_pdf_viewer import pdf_viewer
from document_qa.ner_client_generic import NERClientGeneric
dotenv.load_dotenv(override=True)
import streamlit as st
from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
# Chat models that are served through the OpenAI API.
OPENAI_MODELS = [
    'gpt-3.5-turbo',
    "gpt-4",
    "gpt-4-1106-preview",
]

# Embedding models offered together with the OpenAI chat models. Each entry is
# passed unchanged as the `model` argument of OpenAIEmbeddings, so it has to be
# the exact identifier expected by the OpenAI API.
OPENAI_EMBEDDINGS = [
    'text-embedding-ada-002',
    'text-embedding-3-large',
    'text-embedding-3-small',
]

# Open chat models: label shown in the UI -> Hugging Face repository id.
# NOTE(review): the first label says v0.3 while the repository id points to
# v0.2 -- confirm which version is intended.
OPEN_MODELS = {
    'mistral-7b-instruct-v0.3': 'mistralai/Mistral-7B-Instruct-v0.2',
    # 'Phi-3-mini-128k-instruct': "microsoft/Phi-3-mini-128k-instruct",
    'Phi-3-mini-4k-instruct': "microsoft/Phi-3-mini-4k-instruct",
}

# Label of the embedding function used when none is selected for an open model.
DEFAULT_OPEN_EMBEDDING_NAME = 'Default (all-MiniLM-L6-v2)'

# Open embedding functions: label shown in the UI -> model name handed to
# HuggingFaceEmbeddings.
OPEN_EMBEDDINGS = {
    DEFAULT_OPEN_EMBEDDING_NAME: 'all-MiniLM-L6-v2',
    'SFR-Embedding-Mistral': 'Salesforce/SFR-Embedding-Mistral',
    'SFR-Embedding-2_R': 'Salesforce/SFR-Embedding-2_R',
    'NV-Embed': 'nvidia/NV-Embed-v1',
    'e5-mistral-7b-instruct': 'intfloat/e5-mistral-7b-instruct',
}
# Default value for every session key used by this script. A key is only
# seeded when it is missing, so values set during earlier runs of the same
# session are kept.
_SESSION_DEFAULTS = {
    'rqa': {},
    'model': None,
    'api_keys': {},
    'doc_id': None,
    'loaded_embeddings': None,
    'hash': None,
    'messages': [],
    'ner_processing': False,
    'uploaded': False,
    'memory': None,
    'binary': None,
    'annotations': None,
    'should_show_annotations': True,
    'pdf': None,
    'embeddings': None,
}

for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# The revision is read once per session from "revision.txt" when the file
# exists. Surrounding whitespace is removed because the value is later
# embedded in a markdown link, where a trailing newline would break the URL.
if 'git_rev' not in st.session_state:
    _revision = "unknown"
    if os.path.exists("revision.txt"):
        with open("revision.txt", 'r') as fr:
            _revision = fr.read().strip() or "unknown"
    st.session_state['git_rev'] = _revision
# Entries of the application menu (help, bug report and about text).
_MENU_ITEMS = {
    'Get Help': 'https://github.com/lfoppiano/document-qa',
    'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
    'About': "Upload a scientific article in PDF, ask questions, get insights.",
}

# Page title, icon and layout of the whole application.
st.set_page_config(
    page_title="Scientific Document Insights Q/A",
    page_icon="📝",
    initial_sidebar_state="expanded",
    layout="wide",
    menu_items=_MENU_ITEMS,
)
def new_file():
    """Reset the per-document session state.

    Registered as the ``on_change`` callback of the file uploader: the
    embeddings flag and the document id are cleared, the session is marked as
    having an upload, and the conversational memory (when present) is emptied.
    """
    state = st.session_state
    state['loaded_embeddings'] = None
    state['doc_id'] = None
    state['uploaded'] = True

    memory = state['memory']
    if memory:
        memory.clear()
def clear_memory():
    """Empty the conversational memory kept in the session, if there is one.

    ``st.session_state['memory']`` starts out as ``None`` and is only replaced
    by a memory object when an OpenAI model is initialised, so the call is
    guarded in the same way as in ``new_file``.
    """
    memory = st.session_state['memory']
    if memory:
        memory.clear()
# @st.cache_resource
def init_qa(model, embeddings_name=None, api_key=None):
    """Build the question-answering engine for the selected chat model.

    Args:
        model: a name listed in ``OPENAI_MODELS`` or a key of ``OPEN_MODELS``.
        embeddings_name: name of the embedding function. ``None`` selects
            ``'text-embedding-ada-002'`` for OpenAI models and
            ``DEFAULT_OPEN_EMBEDDING_NAME`` for open models.
        api_key: optional credential. For OpenAI models it is forwarded to the
            chat and embedding clients. For open models it is forwarded to the
            Hugging Face endpoint as its API token; when omitted, the key
            stored under ``st.session_state['api_keys'][model]`` is used if
            there is one. When no key is available the clients are built
            without an explicit credential.

    Returns:
        A ``DocumentQAEngine`` wired to the chat model, a ``DataStorage`` built
        on the embeddings, the Grobid URL from the ``GROBID_URL`` environment
        variable and the memory currently held in the session.

    An unknown model or an embedding name that does not belong to the model
    family is reported with ``st.error`` and the script run is halted with
    ``st.stop()``.
    """
    ## For debug add: callbacks=[PromptLayerCallbackHandler(pl_tags=["langchain", "chatgpt", "document-qa"])])
    if model in OPENAI_MODELS:
        if embeddings_name is None:
            embeddings_name = 'text-embedding-ada-002'

        st.session_state['memory'] = ConversationBufferWindowMemory(k=4)

        # Validate on every path, not only when an explicit key is supplied.
        if embeddings_name not in OPENAI_EMBEDDINGS:
            st.error(f"The embeddings provided {embeddings_name} are not supported by this model {model}.")
            st.stop()
            return None

        # Only pass the key when one was given; otherwise the clients are
        # constructed exactly as before, without an explicit credential.
        credentials = {'openai_api_key': api_key} if api_key else {}
        chat = ChatOpenAI(model_name=model,
                          temperature=0,
                          frequency_penalty=0.1,
                          **credentials)
        embeddings = OpenAIEmbeddings(model=embeddings_name, **credentials)

    elif model in OPEN_MODELS:
        if embeddings_name is None:
            embeddings_name = DEFAULT_OPEN_EMBEDDING_NAME

        # Report an unsupported name instead of failing with a bare KeyError.
        if embeddings_name not in OPEN_EMBEDDINGS:
            st.error(f"The embeddings provided {embeddings_name} are not supported by this model {model}.")
            st.stop()
            return None

        # The sidebar stores the key typed by the user in the session before
        # calling this function; forward it so that it is actually used.
        token = api_key or st.session_state.get('api_keys', {}).get(model)

        chat = HuggingFaceEndpoint(
            repo_id=OPEN_MODELS[model],
            temperature=0.01,
            max_new_tokens=2048,
            model_kwargs={"max_length": 4096},
            huggingfacehub_api_token=token
        )
        embeddings = HuggingFaceEmbeddings(
            model_name=OPEN_EMBEDDINGS[embeddings_name])
        # st.session_state['memory'] = ConversationBufferWindowMemory(k=4) if model not in DISABLE_MEMORY else None
    else:
        st.error("The model was not loaded properly. Try reloading. ")
        st.stop()
        return None

    storage = DataStorage(embeddings)
    return DocumentQAEngine(chat, storage, grobid_url=os.environ['GROBID_URL'], memory=st.session_state['memory'])
@st.cache_resource
def init_ner():
    """Create the processor that aggregates quantities and materials annotations.

    The quantities client is pointed at ``GROBID_QUANTITIES_URL`` and the
    generic NER client is configured with ``GROBID_MATERIALS_URL``; both are
    then handed to a ``GrobidAggregationProcessor``, which is returned.
    """
    quantities = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
    materials = NERClientGeneric(ping=True)

    # Alternative endpoint without the flag: "/service/process/text"
    endpoints = {
        'processText_disable_linking': "/service/process/text?disableLinking=True",
    }
    grobid_settings = {
        "server": os.environ['GROBID_MATERIALS_URL'],
        'sleep_time': 5,
        'timeout': 60,
        'url_mapping': endpoints,
    }
    materials.set_config({'grobid': grobid_settings})

    return GrobidAggregationProcessor(grobid_quantities_client=quantities,
                                      grobid_superconductors_client=materials)


# Processor instance used for the NER post-processing of LLM answers.
gqa = init_ner()
def get_file_hash(fname):
    """Return the hexadecimal BLAKE2b digest of the file at ``fname``.

    The file is opened in binary mode and consumed in blocks of 4096 bytes, so
    its content is never loaded in one piece.
    """
    digest = blake2b()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def play_old_messages(container):
    """Render the conversation stored in the session inside ``container``.

    User messages are shown as markdown. Assistant messages are rendered
    according to the query mode recorded with the message itself: answers
    produced in ``"llm"`` mode are markdown with HTML enabled (they may carry
    the inline styling added by the NER post-processing), everything else is
    passed to ``write``.

    Args:
        container: the Streamlit container that receives the chat bubbles.
    """
    for message in st.session_state['messages'] or []:
        role = message['role']
        if role == 'user':
            container.chat_message("user").markdown(message['content'])
        elif role == 'assistant':
            # Use the mode stored with the message (lower-case values), not the
            # radio button's current selection.
            if message.get('mode') == "llm":
                container.chat_message("assistant").markdown(message['content'], unsafe_allow_html=True)
            else:
                container.chat_message("assistant").write(message['content'])
# is_api_key_provided = st.session_state['api_key']
# Sidebar, first part: title, model and embeddings selection, API key input and
# creation of the question-answering engine for the selected model.
with st.sidebar:
    st.title("📝 Scientific Document Insights Q/A")
    st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
    st.markdown(
        ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")

    st.divider()

    model_choices = OPENAI_MODELS + list(OPEN_MODELS.keys())
    # DEFAULT_MODEL may be unset, empty, or name a model that is not offered;
    # in all of these cases fall back to the first entry instead of raising.
    default_model = os.environ.get("DEFAULT_MODEL")
    default_model_index = model_choices.index(default_model) if default_model in model_choices else 0

    st.session_state['model'] = model = st.selectbox(
        "Model:",
        options=model_choices,
        index=default_model_index,
        placeholder="Select model",
        help="Select the LLM model:",
        disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
    )

    # NOTE(review): the engine below is created once per model, so changing the
    # embeddings after that point does not rebuild it -- confirm this is intended.
    embedding_choices = OPENAI_EMBEDDINGS if model in OPENAI_MODELS else OPEN_EMBEDDINGS

    st.session_state['embeddings'] = embedding_name = st.selectbox(
        "Embeddings:",
        options=embedding_choices,
        index=0,
        placeholder="Select embedding",
        help="Select the Embedding function:",
        disabled=st.session_state['doc_id'] is not None or st.session_state['uploaded']
    )

    # Both branches only run while no key is recorded for the model, so the
    # engine is always (re)created once a key becomes available.
    if (model in OPEN_MODELS) and model not in st.session_state['api_keys']:
        if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
            api_key = st.text_input('Huggingface API Key', type="password")
            st.markdown("Get it [here](https://huggingface.co/docs/hub/security-tokens)")
        else:
            api_key = os.environ['HUGGINGFACEHUB_API_TOKEN']

        if api_key:
            # st.session_state['api_key'] = is_api_key_provided = True
            with st.spinner("Preparing environment"):
                st.session_state['api_keys'][model] = api_key
                # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
                #     os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
                st.session_state['rqa'][model] = init_qa(model, embedding_name)

    elif model in OPENAI_MODELS and model not in st.session_state['api_keys']:
        if 'OPENAI_API_KEY' not in os.environ:
            api_key = st.text_input('OpenAI API Key', type="password")
            st.markdown("Get it [here](https://platform.openai.com/account/api-keys)")
        else:
            api_key = os.environ['OPENAI_API_KEY']

        if api_key:
            with st.spinner("Preparing environment"):
                st.session_state['api_keys'][model] = api_key
                # The key is only passed explicitly when it was typed in the UI;
                # otherwise init_qa is called without it.
                if 'OPENAI_API_KEY' not in os.environ:
                    st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'], api_key)
                else:
                    st.session_state['rqa'][model] = init_qa(model, st.session_state['embeddings'])
    # else:
    #     is_api_key_provided = st.session_state['api_key']

    # st.button(
    #     'Reset chat memory.',
    #     key="reset-memory-button",
    #     on_click=clear_memory,
    #     help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.",
    #     disabled=model in st.session_state['rqa'] and st.session_state['rqa'][model].memory is None)
# Main area: two equally wide, bordered panels. The right one is created first.
_columns = st.columns([1, 1])
right_column = _columns[1].container(border=True)
left_column = _columns[0].container(border=True)

with right_column:
    # The uploader stays disabled while a model is selected for which no API
    # key has been recorded yet.
    _selected_model = st.session_state['model']
    _upload_blocked = _selected_model is not None and _selected_model not in st.session_state['api_keys']

    uploaded_file = st.file_uploader(
        "Upload an article",
        type=("pdf", "txt"),
        on_change=new_file,
        disabled=_upload_blocked,
        help="The full-text is extracted using Grobid."
    )

    # Slot used by the spinners, followed by the fixed-height chat history.
    placeholder = st.empty()
    messages = st.container(height=300)

    # The question box is only enabled once a file has been uploaded.
    # placeholder="Can you give me a short summary?",
    question = st.chat_input("Ask something about the article", disabled=not uploaded_file)

# Query mode identifier -> label shown in the settings.
query_modes = dict([
    ("llm", "LLM Q/A"),
    ("embeddings", "Embeddings"),
    ("question_coefficient", "Question coefficient"),
])
# Sidebar, second part: query settings followed by the documentation section.
with st.sidebar:
    st.header("Settings")

    _query_mode_help = ("LLM will respond the question, Embedding will show the "
                        "relevant paragraphs to the question in the paper. "
                        "Question coefficient attempt to estimate how effective the question will be answered.")
    mode = st.radio(
        "Query mode",
        ("llm", "embeddings", "question_coefficient"),
        disabled=not uploaded_file,
        index=0,
        horizontal=True,
        format_func=lambda mode_key: query_modes[mode_key],
        help=_query_mode_help
    )

    st.session_state['ner_processing'] = st.checkbox(
        "Identify materials and properties.",
        help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
    )

    # Add a checkbox for showing annotations
    # st.session_state['show_annotations'] = st.checkbox("Show annotations", value=True)
    # st.session_state['should_show_annotations'] = st.checkbox("Show annotations", value=True)

    # The chunk size can no longer be changed once a file has been uploaded.
    chunk_size = st.slider(
        "Text chunks size",
        -1,
        2000,
        value=-1,
        help="Size of chunks in which split the document. -1: use paragraphs, > 0 paragraphs are aggregated.",
        disabled=uploaded_file is not None
    )

    # The context slider counts paragraphs when the document is not re-chunked
    # and chunks otherwise; the two variants differ in label, range and default.
    if chunk_size == -1:
        _context_slider = ("Context size (paragraphs)", 20, 10,
                           "Number of paragraphs to consider when answering a question")
    else:
        _context_slider = ("Context size (chunks)", 10, 4,
                           "Number of chunks to consider when answering a question")
    _context_label, _context_max, _context_default, _context_help = _context_slider
    context_size = st.slider(_context_label, 3, _context_max, value=_context_default,
                             help=_context_help,
                             disabled=not uploaded_file)

    st.divider()

    st.header("Documentation")
    st.markdown("https://github.com/lfoppiano/document-qa")
    st.markdown(
        """Upload a scientific article as PDF document. Once the spinner stops, you can proceed to ask your questions.""")

    # Link the revision to its commit page when a revision is known.
    _git_rev = st.session_state['git_rev']
    if _git_rev != "unknown":
        st.markdown("**Revision number**: [" + _git_rev
                    + "](https://github.com/lfoppiano/document-qa/commit/" + _git_rev + ")")
# Process a newly uploaded document: write it to a temporary file, let the
# engine of the selected model build the embeddings, and store the resulting
# document id together with the raw bytes in the session.
if uploaded_file and not st.session_state.loaded_embeddings:
    if model not in st.session_state['api_keys']:
        st.error("Before uploading a document, you must enter the API key. ")
        st.stop()

    with left_column:
        with st.spinner('Reading file, calling Grobid, and creating memory embeddings...'):
            binary = uploaded_file.getvalue()
            st.session_state['binary'] = binary

            # The engine receives the file by path, so the buffered content
            # must be flushed to disk before the call. The context manager
            # closes (and thereby removes) the temporary file afterwards.
            with NamedTemporaryFile() as tmp_file:
                tmp_file.write(binary)
                tmp_file.flush()

                st.session_state['doc_id'] = st.session_state['rqa'][model].create_memory_embeddings(
                    tmp_file.name,
                    chunk_size=chunk_size,
                    perc_overlap=0.1)

            st.session_state['loaded_embeddings'] = True
            st.session_state.messages = []
def rgb_to_hex(rgb):
    """Format the first three integer components of ``rgb`` as ``#rrggbb``.

    Each component is written as two lower-case hexadecimal digits.
    """
    channels = tuple(rgb)
    return "#" + "".join(format(channels[position], "02x") for position in range(3))
def generate_color_gradient(num_elements):
    """Return ``num_elements`` hex colours going from orange towards blue.

    Entry ``i`` linearly mixes the two colours with a blue share of
    ``i / num_elements``; each mixed channel is truncated with ``int`` before
    being formatted by ``rgb_to_hex``.
    """
    warm_color = (255, 165, 0)  # Orange
    cold_color = (0, 0, 255)  # Blue

    palette = []
    for step in range(num_elements):
        blended = tuple(
            int(warm * (1 - step / num_elements) + cold * (step / num_elements))
            for warm, cold in zip(warm_color, cold_color)
        )
        palette.append(rgb_to_hex(blended))
    return palette
# Right panel: answer a new question, or just replay the stored conversation
# when no question was submitted in this run.
with right_column:
    if st.session_state.loaded_embeddings and question and st.session_state.doc_id:
        # Replay the history. Each message gets exactly one chat bubble; user
        # text is rendered as plain markdown (no raw HTML), assistant text
        # according to the mode stored with the message.
        for message in st.session_state.messages:
            bubble = messages.chat_message(message["role"])
            if message["role"] == "user":
                bubble.markdown(message["content"])
            elif message['mode'] == "embeddings":
                bubble.write(message["content"])
            else:
                bubble.markdown(message["content"], unsafe_allow_html=True)

        if model not in st.session_state['rqa']:
            st.error("The API Key for the " + model + " is missing. Please add it before sending any query.")
            st.stop()

        messages.chat_message("user").markdown(question)
        st.session_state.messages.append({"role": "user", "mode": mode, "content": question})

        engine = st.session_state['rqa'][model]
        text_response = None
        coordinates = None

        if mode == "embeddings":
            with placeholder:
                with st.spinner("Fetching the relevant context..."):
                    text_response, coordinates = engine.query_storage(
                        question,
                        st.session_state.doc_id,
                        context_size=context_size
                    )
        elif mode == "llm":
            with placeholder:
                with st.spinner("Generating LLM response..."):
                    _, text_response, coordinates = engine.query_document(
                        question,
                        st.session_state.doc_id,
                        context_size=context_size
                    )
        elif mode == "question_coefficient":
            with st.spinner("Estimate question/context relevancy..."):
                text_response, coordinates = engine.analyse_query(
                    question,
                    st.session_state.doc_id,
                    context_size=context_size
                )

        # One list of boxes per returned coordinate group; each box is built
        # from a comma-separated coordinate string. All boxes of a group share
        # one colour of the gradient.
        annotations = [
            [GrobidAggregationProcessor.box_to_dict(c.split(",")) for c in coord_doc]
            for coord_doc in (coordinates or [])
        ]
        for color, annotation_doc in zip(generate_color_gradient(len(annotations)), annotations):
            for annotation in annotation_doc:
                annotation['color'] = color

        st.session_state['annotations'] = [annotation for annotation_doc in annotations
                                           for annotation in annotation_doc]

        if not text_response:
            # Report the failure and skip rendering, NER processing and history
            # for an empty answer; the rest of the page is still drawn.
            st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
        else:
            if mode == "llm":
                if st.session_state['ner_processing']:
                    with st.spinner("Processing NER on LLM response..."):
                        entities = gqa.process_single_text(text_response)
                        decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
                        # Replace the CSS classes of the decorated entities
                        # with inline colours: green for materials, orange for
                        # every other label.
                        decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
                        decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
                        text_response = decorated_text
                messages.chat_message("assistant").markdown(text_response, unsafe_allow_html=True)
            else:
                messages.chat_message("assistant").write(text_response)
            st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})

    elif st.session_state.loaded_embeddings and st.session_state.doc_id:
        play_old_messages(messages)
# Left panel: show the uploaded document, when there is one, together with the
# annotations stored in the session.
with left_column:
    _document_bytes = st.session_state['binary']
    if _document_bytes:
        pdf_viewer(
            input=_document_bytes,
            annotation_outline_size=2,
            annotations=st.session_state['annotations'],
            render_text=True,
            height=600,
        )