Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -286,11 +286,19 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
|
|
286 |
then there is no record for the country and no answer can be obtained."""
|
287 |
|
288 |
# different retrievers
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
# ensemble (below) reranks results from both retrievers above
|
295 |
ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
|
296 |
# for user to make selection
|
@@ -396,17 +404,31 @@ with st.sidebar:
|
|
396 |
],
|
397 |
icons=['house', 'gear', 'gear', 'gear'],
|
398 |
menu_icon="", default_index=0)
|
399 |
-
|
400 |
|
401 |
with st.expander("Warning", expanded = True):
|
402 |
st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
|
403 |
|
404 |
-
st.
|
405 |
|
406 |
-
# see if retrievers created by user's own uploaded PDF or newly scraped data is found
|
407 |
new_documents_chroma = glob.glob("chromadb/new*")
|
408 |
-
new_documents_bm25 = glob.glob("
|
409 |
-
new_countries = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
if len(new_countries) == 0:
|
411 |
info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
|
412 |
else:
|
@@ -605,10 +627,13 @@ if page == "Scrape or Upload Own Docs":
|
|
605 |
options=options
|
606 |
)
|
607 |
|
608 |
-
|
609 |
-
|
|
|
|
|
|
|
610 |
with st.form(key='upload_pdf_form'):
|
611 |
-
|
612 |
uploaded_pdf = st.file_uploader("Upload a PDF")
|
613 |
if uploaded_pdf:
|
614 |
temp_file = "./temp.pdf"
|
@@ -617,9 +642,10 @@ if page == "Scrape or Upload Own Docs":
|
|
617 |
pdf_filename, = uploaded_pdf.name
|
618 |
submit_upload_pdf = st.form_submit_button(label='Submit')
|
619 |
|
620 |
-
|
621 |
-
|
622 |
-
with st.form(key='
|
|
|
623 |
n_search_results = st.number_input(
|
624 |
"How many DuckDuckGo search results would you like to scrape?",
|
625 |
0, 20,
|
@@ -631,19 +657,26 @@ if page == "Scrape or Upload Own Docs":
|
|
631 |
)
|
632 |
submit_scrape_web = st.form_submit_button(label='Submit')
|
633 |
|
634 |
-
if submit_upload_pdf | submit_scrape_web:
|
635 |
-
|
636 |
if submit_upload_pdf:
|
637 |
-
|
|
|
638 |
|
639 |
if submit_scrape_web:
|
640 |
-
|
641 |
-
|
|
|
|
|
|
|
642 |
|
643 |
-
# vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
644 |
-
# can be used to override existing vectorstore for this country in sidebar document configuration
|
645 |
-
setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
|
646 |
|
647 |
-
|
648 |
-
|
649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
then there is no record for the country and no answer can be obtained."""
|
287 |
|
288 |
# different retrievers
|
289 |
+
if country in st.session_state['countries_override']:
|
290 |
+
# # keyword
|
291 |
+
# bm = new_bm25_retrievers[country]
|
292 |
+
# bm.k = st.session_state['bm25_n_similar_documents']
|
293 |
+
# # semantic
|
294 |
+
# chroma = new_chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
|
295 |
+
pass
|
296 |
+
else:
|
297 |
+
# keyword
|
298 |
+
bm = bm25_retrievers[country]
|
299 |
+
bm.k = st.session_state['bm25_n_similar_documents']
|
300 |
+
# semantic
|
301 |
+
chroma = chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
|
302 |
# ensemble (below) reranks results from both retrievers above
|
303 |
ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
|
304 |
# for user to make selection
|
|
|
404 |
],
|
405 |
icons=['house', 'gear', 'gear', 'gear'],
|
406 |
menu_icon="", default_index=0)
|
|
|
407 |
|
408 |
with st.expander("Warning", expanded = True):
|
409 |
st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
|
410 |
|
411 |
+
st.write("")
|
412 |
|
413 |
+
# check whether retrievers/vector stores created from the user's own uploaded PDF or newly scraped data exist
|
414 |
new_documents_chroma = glob.glob("chromadb/new*")
|
415 |
+
new_documents_bm25 = glob.glob("bm25/new*")
|
416 |
+
new_countries = []
|
417 |
+
|
418 |
+
# loop through new docs in chroma retrievers created by user scraping/pdf
|
419 |
+
for i, doc in enumerate(new_documents_chroma):
|
420 |
+
if doc.split('/')[1] == new_documents_bm25[i].split('/')[1]: # check that the doc also exists for bm25 retriever
|
421 |
+
new_doc_country = doc.split('_')[1]
|
422 |
+
new_doc_chunk_size = doc.split('_')[3]
|
423 |
+
new_doc_chunk_overlap = doc.split('_')[5]
|
424 |
+
|
425 |
+
# check that the retrievers are created for the current selected chunk sizes
|
426 |
+
if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
|
427 |
+
new_countries.append(new_doc_country)
|
428 |
+
|
429 |
+
# if new retrievers that pass the above criteria are found, let the user know their countries
|
430 |
+
# the user can select from these countries to override existing retrievers
|
431 |
+
# otherwise prompt user to scrape or upload own PDF to create the new retrievers
|
432 |
if len(new_countries) == 0:
|
433 |
info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
|
434 |
else:
|
|
|
627 |
options=options
|
628 |
)
|
629 |
|
630 |
+
submit_upload_pdf = False
|
631 |
+
submit_scrape_web = False
|
632 |
+
|
633 |
+
# form for user to configure pdf loading options
|
634 |
+
if option == options[0]:
|
635 |
with st.form(key='upload_pdf_form'):
|
636 |
+
st.subheader(f"Selected Option: {option}")
|
637 |
uploaded_pdf = st.file_uploader("Upload a PDF")
|
638 |
if uploaded_pdf:
|
639 |
temp_file = "./temp.pdf"
|
|
|
642 |
pdf_filename = uploaded_pdf.name
|
643 |
submit_upload_pdf = st.form_submit_button(label='Submit')
|
644 |
|
645 |
+
# form for user to configure web scraping for duckduckgo
|
646 |
+
if option == options[1]:
|
647 |
+
with st.form(key='scrape_web_form'):
|
648 |
+
st.subheader(f"Selected Option: {option}")
|
649 |
n_search_results = st.number_input(
|
650 |
"How many DuckDuckGo search results would you like to scrape?",
|
651 |
0, 20,
|
|
|
657 |
)
|
658 |
submit_scrape_web = st.form_submit_button(label='Submit')
|
659 |
|
660 |
+
if (submit_upload_pdf | submit_scrape_web):
|
|
|
661 |
if submit_upload_pdf:
|
662 |
+
with st.spinner('Generating documents from PDF...'):
|
663 |
+
all_documents = pdf_loader_local(pdf_filename, country_scrape_upload)
|
664 |
|
665 |
if submit_scrape_web:
|
666 |
+
with st.spinner('Scraping web using Duck Duck Go search...'):
|
667 |
+
all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
|
668 |
+
st.write(df_links)
|
669 |
+
with st.spinner('Generating documents from web search results...'):
|
670 |
+
all_documents = process_links_load_documents(all_links)
|
671 |
|
|
|
|
|
|
|
672 |
|
673 |
+
with st.spinner('Setting up new bm25 retrievers with documents, can take very long...'):
|
674 |
+
# bm25 retriever for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
675 |
+
# can be used to override the existing bm25 retriever for this country in sidebar document configuration
|
676 |
+
setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
|
677 |
+
|
678 |
+
|
679 |
+
with st.spinner('Setting up new chromadb vector stores with documents, can take 5 mins and above...'):
|
680 |
+
# vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
681 |
+
# can be used to override existing vectorstore for this country in sidebar document configuration
|
682 |
+
setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
|