lfoppiano committed on
Commit
924fb11
·
1 Parent(s): cd17f01

load pdf in page

Browse files
Files changed (1) hide show
  1. streamlit_app.py +66 -52
streamlit_app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import re
3
  from hashlib import blake2b
@@ -55,6 +56,7 @@ st.set_page_config(
55
  page_title="Scientific Document Insights Q/A",
56
  page_icon="πŸ“",
57
  initial_sidebar_state="expanded",
 
58
  menu_items={
59
  'Get Help': 'https://github.com/lfoppiano/document-qa',
60
  'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
@@ -204,17 +206,19 @@ with st.sidebar:
204
  st.session_state['rqa'][model] = init_qa(model)
205
  # else:
206
  # is_api_key_provided = st.session_state['api_key']
 
207
 
208
- st.title("πŸ“ Scientific Document Insights Q/A")
209
- st.subheader("Upload a scientific article in PDF, ask questions, get insights.")
 
210
 
211
- st.markdown(
212
- ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third parties LLMs.")
213
 
214
- uploaded_file = st.file_uploader("Upload an article", type=("pdf", "txt"), on_change=new_file,
215
- disabled=st.session_state['model'] is not None and st.session_state['model'] not in
216
- st.session_state['api_keys'],
217
- help="The full-text is extracted using Grobid. ")
218
 
219
  question = st.chat_input(
220
  "Ask something about the article",
@@ -257,6 +261,13 @@ with st.sidebar:
257
  st.markdown(
258
  """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
259
 
 
 
 
 
 
 
 
260
  if uploaded_file and not st.session_state.loaded_embeddings:
261
  if model not in st.session_state['api_keys']:
262
  st.error("Before uploading a document, you must enter the API key. ")
@@ -265,7 +276,9 @@ if uploaded_file and not st.session_state.loaded_embeddings:
265
  binary = uploaded_file.getvalue()
266
  tmp_file = NamedTemporaryFile()
267
  tmp_file.write(bytearray(binary))
268
- # hash = get_file_hash(tmp_file.name)[:10]
 
 
269
  st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
270
  chunk_size=chunk_size,
271
  perc_overlap=0.1)
@@ -274,47 +287,48 @@ if uploaded_file and not st.session_state.loaded_embeddings:
274
 
275
  # timestamp = datetime.utcnow()
276
 
277
- if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
278
- for message in st.session_state.messages:
279
- with st.chat_message(message["role"]):
280
- if message['mode'] == "LLM":
281
- st.markdown(message["content"], unsafe_allow_html=True)
282
- elif message['mode'] == "Embeddings":
283
- st.write(message["content"])
284
- if model not in st.session_state['rqa']:
285
- st.error("The API Key for the " + model + " is missing. Please add it before sending any query. `")
286
- st.stop()
287
-
288
- with st.chat_message("user"):
289
- st.markdown(question)
290
- st.session_state.messages.append({"role": "user", "mode": mode, "content": question})
291
-
292
- text_response = None
293
- if mode == "Embeddings":
294
- with st.spinner("Generating LLM response..."):
295
- text_response = st.session_state['rqa'][model].query_storage(question, st.session_state.doc_id,
296
- context_size=context_size)
297
- elif mode == "LLM":
298
- with st.spinner("Generating response..."):
299
- _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
300
  context_size=context_size)
301
-
302
- if not text_response:
303
- st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
304
-
305
- with st.chat_message("assistant"):
306
- if mode == "LLM":
307
- if st.session_state['ner_processing']:
308
- with st.spinner("Processing NER on LLM response..."):
309
- entities = gqa.process_single_text(text_response)
310
- decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
311
- decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
312
- decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
313
- text_response = decorated_text
314
- st.markdown(text_response, unsafe_allow_html=True)
315
- else:
316
- st.write(text_response)
317
- st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})
318
-
319
- elif st.session_state.loaded_embeddings and st.session_state.doc_id:
320
- play_old_messages()
 
 
 
 
 
1
+ import base64
2
  import os
3
  import re
4
  from hashlib import blake2b
 
56
  page_title="Scientific Document Insights Q/A",
57
  page_icon="πŸ“",
58
  initial_sidebar_state="expanded",
59
+ layout="wide",
60
  menu_items={
61
  'Get Help': 'https://github.com/lfoppiano/document-qa',
62
  'Report a bug': "https://github.com/lfoppiano/document-qa/issues",
 
206
  st.session_state['rqa'][model] = init_qa(model)
207
  # else:
208
  # is_api_key_provided = st.session_state['api_key']
209
# Split the main area into two equally wide columns. The right-hand column
# holds the title, the privacy warning and the file uploader; the left-hand
# column is filled later with the preview of the uploaded PDF.
left_column, right_column = st.columns([1, 1])

with right_column:
    st.title("📝 Scientific Document Insights Q/A")
    st.subheader("Upload a scientific article in PDF, ask questions, get insights.")

    st.markdown(
        ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third parties LLMs.")

    # The uploader is disabled while a model is selected whose name is not
    # (yet) present in st.session_state['api_keys']. `new_file` runs whenever
    # the uploaded file changes.
    uploaded_file = st.file_uploader(
        "Upload an article",
        type=("pdf", "txt"),
        on_change=new_file,
        disabled=(
            st.session_state['model'] is not None
            and st.session_state['model'] not in st.session_state['api_keys']
        ),
        help="The full-text is extracted using Grobid. ",
    )
222
 
223
  question = st.chat_input(
224
  "Ask something about the article",
 
261
  st.markdown(
262
  """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
263
 
264
+
265
# st.cache_data (not st.cache_resource) because the result is plain,
# serialisable data rather than a shared global resource. max_entries bounds
# the cache so the base64 copies of old uploads are eventually evicted.
@st.cache_data(max_entries=16)
def get_pdf_display(binary, width=700, height=950):
    """Build an HTML ``<embed>`` element that shows a PDF inline.

    The PDF bytes are base64-encoded and placed in a ``data:`` URI, so the
    returned markup is self-contained and can be rendered with
    ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        binary: Raw content of the PDF file, as bytes.
        width: Value of the ``width`` attribute of the embed element.
        height: Value of the ``height`` attribute of the embed element.

    Returns:
        The ``<embed>`` element as a string.
    """
    base64_pdf = base64.b64encode(binary).decode('utf-8')
    return (
        f'<embed src="data:application/pdf;base64,{base64_pdf}" '
        f'width="{width}" height="{height}" type="application/pdf"></embed>'
    )
269
+
270
+
271
  if uploaded_file and not st.session_state.loaded_embeddings:
272
  if model not in st.session_state['api_keys']:
273
  st.error("Before uploading a document, you must enter the API key. ")
 
276
  binary = uploaded_file.getvalue()
277
  tmp_file = NamedTemporaryFile()
278
  tmp_file.write(bytearray(binary))
279
+
280
+ left_column.markdown(get_pdf_display(binary), unsafe_allow_html=True)
281
+
282
  st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
283
  chunk_size=chunk_size,
284
  perc_overlap=0.1)
 
287
 
288
  # timestamp = datetime.utcnow()
289
 
290
# Chat panel, rendered in the right-hand column.
# When a document is loaded and a new question has been typed: replay the
# stored conversation, run the query in the selected mode, render the answer
# and append both turns to st.session_state.messages.
# When a document is loaded but no question is pending: only replay history.
with right_column:
    if st.session_state.loaded_embeddings and question and st.session_state.doc_id:
        # Re-draw the conversation so far before adding the new turn.
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                if message['mode'] == "LLM":
                    # NOTE(review): content is rendered as raw HTML (needed for
                    # the coloured NER spans); this also applies to user turns
                    # stored with mode "LLM" - confirm this is acceptable.
                    st.markdown(message["content"], unsafe_allow_html=True)
                elif message['mode'] == "Embeddings":
                    st.write(message["content"])

        if model not in st.session_state['rqa']:
            st.error("The API Key for the " + model + " is missing. Please add it before sending any query.")
            st.stop()

        with st.chat_message("user"):
            st.markdown(question)
        st.session_state.messages.append({"role": "user", "mode": mode, "content": question})

        text_response = None
        if mode == "Embeddings":
            # This mode returns document chunks related to the query,
            # so the spinner must not announce an LLM response.
            with st.spinner("Retrieving relevant passages..."):
                text_response = st.session_state['rqa'][model].query_storage(
                    question, st.session_state.doc_id, context_size=context_size)
        elif mode == "LLM":
            with st.spinner("Generating response..."):
                _, text_response = st.session_state['rqa'][model].query_document(
                    question, st.session_state.doc_id, context_size=context_size)

        if not text_response:
            st.error("Something went wrong. Contact Luca Foppiano (Foppiano.Luca@nims.co.jp) to report the issue.")
            # Stop here, as done for the missing API key above: continuing
            # would call .strip()/NER on a missing response and store an
            # empty assistant message in the history.
            st.stop()

        with st.chat_message("assistant"):
            if mode == "LLM":
                if st.session_state['ner_processing']:
                    with st.spinner("Processing NER on LLM response..."):
                        entities = gqa.process_single_text(text_response)
                        decorated_text = decorate_text_with_annotations(text_response.strip(), entities)
                        # Replace the CSS classes of the annotations with inline
                        # colours: green for materials, orange for other labels.
                        decorated_text = decorated_text.replace('class="label material"', 'style="color:green"')
                        decorated_text = re.sub(r'class="label[^"]+"', 'style="color:orange"', decorated_text)
                        text_response = decorated_text
                st.markdown(text_response, unsafe_allow_html=True)
            else:
                st.write(text_response)
        st.session_state.messages.append({"role": "assistant", "mode": mode, "content": text_response})

    elif st.session_state.loaded_embeddings and st.session_state.doc_id:
        play_old_messages()