bohmian committed on
Commit
abd0396
·
verified ·
1 Parent(s): 72d76ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -27
app.py CHANGED
@@ -286,11 +286,19 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
286
  then there is no record for the country and no answer can be obtained."""
287
 
288
  # different retrievers
289
- # keyword
290
- bm = bm25_retrievers[country]
291
- bm.k = st.session_state['bm25_n_similar_documents']
292
- # semantic
293
- chroma = chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
 
 
 
 
 
 
 
 
294
  # ensemble (below) reranks results from both retrievers above
295
  ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
296
  # for user to make selection
@@ -396,17 +404,31 @@ with st.sidebar:
396
  ],
397
  icons=['house', 'gear', 'gear', 'gear'],
398
  menu_icon="", default_index=0)
399
-
400
 
401
  with st.expander("Warning", expanded = True):
402
  st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
403
 
404
- st.container()
405
 
406
- # see if retrievers created by user's own uploaded PDF or newly scraped data is found
407
  new_documents_chroma = glob.glob("chromadb/new*")
408
- new_documents_bm25 = glob.glob("chromadb/new*")
409
- new_countries = [doc.split('_')[1] for doc in new_documents_chroma]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  if len(new_countries) == 0:
411
  info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
412
  else:
@@ -605,10 +627,13 @@ if page == "Scrape or Upload Own Docs":
605
  options=options
606
  )
607
 
608
- if option == options[0]:
609
- st.subheader(f"Selected Option: {option}")
 
 
 
610
  with st.form(key='upload_pdf_form'):
611
-
612
  uploaded_pdf = st.file_uploader("Upload a PDF")
613
  if uploaded_pdf:
614
  temp_file = "./temp.pdf"
@@ -617,9 +642,10 @@ if page == "Scrape or Upload Own Docs":
617
  pdf_filename, = uploaded_pdf.name
618
  submit_upload_pdf = st.form_submit_button(label='Submit')
619
 
620
- if option == options[1]:
621
- st.subheader(f"Selected Option: {option}")
622
- with st.form(key='upload_pdf_form'):
 
623
  n_search_results = st.number_input(
624
  "How many DuckDuckGo search results would you like to scrape?",
625
  0, 20,
@@ -631,19 +657,26 @@ if page == "Scrape or Upload Own Docs":
631
  )
632
  submit_scrape_web = st.form_submit_button(label='Submit')
633
 
634
- if submit_upload_pdf | submit_scrape_web:
635
-
636
  if submit_upload_pdf:
637
- all_documents = pdf_loader_local(pdf_filename, country)
 
638
 
639
  if submit_scrape_web:
640
- all_links, df_links = duckduckgo_scrape(country, search_term, n_search_results)
641
- all_documents = process_links_load_documents(all_links)
 
 
 
642
 
643
- # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
644
- # can be used to override existing vectorstore for this country in sidebar document configuration
645
- setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
646
 
647
- # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
648
- # can be used to override existing vectorstore for this country in sidebar document configuration
649
- setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country)
 
 
 
 
 
 
 
 
286
  then there is no record for the country and no answer can be obtained."""
287
 
288
  # different retrievers
289
+ if country in st.session_state['countries_override']:
290
+ # # keyword
291
+ # bm = new_bm25_retrievers[country]
292
+ # bm.k = st.session_state['bm25_n_similar_documents']
293
+ # # semantic
294
+ # chroma = new_chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
295
+ pass
296
+ else:
297
+ # keyword
298
+ bm = bm25_retrievers[country]
299
+ bm.k = st.session_state['bm25_n_similar_documents']
300
+ # semantic
301
+ chroma = chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
302
  # ensemble (below) reranks results from both retrievers above
303
  ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
304
  # for user to make selection
 
404
  ],
405
  icons=['house', 'gear', 'gear', 'gear'],
406
  menu_icon="", default_index=0)
 
407
 
408
  with st.expander("Warning", expanded = True):
409
  st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing. Wait for query to complete first.")
410
 
411
+ st.write("")
412
 
413
+ # check whether retrievers/vector stores created from the user's own uploaded PDF or newly scraped data exist
414
  new_documents_chroma = glob.glob("chromadb/new*")
415
+ new_documents_bm25 = glob.glob("bm25/new*")
416
+ new_countries = []
417
+
418
+ # loop through the new chroma vector stores created from the user's scraped or uploaded documents
419
+ for i, doc in enumerate(new_documents_chroma):
420
+ if (doc.split('/')[1] == new_documents_bm25[i].split('/')[1]: # check that the doc also exists for bm25 retriever
421
+ new_doc_country = doc.split('_')[1]
422
+ new_doc_chunk_size = doc.split('_')[3]
423
+ new_doc_chunk_overlap = doc.split('_')[5]
424
+
425
+ # check that the retrievers were created with the currently selected chunk size and chunk overlap
426
+ if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
427
+ new_countries.append(new_doc_country)
428
+
429
+ # if new retrievers that pass the above criteria are found, let the user know their countries
430
+ # the user can select from these countries to override existing retrievers
431
+ # otherwise prompt user to scrape or upload own PDF to create the new retrievers
432
  if len(new_countries) == 0:
433
  info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
434
  else:
 
627
  options=options
628
  )
629
 
630
+ submit_upload_pdf = False
631
+ submit_scrape_web = False
632
+
633
+ # form for user to configure pdf loading options
634
+ if option == options[0]:
635
  with st.form(key='upload_pdf_form'):
636
+ st.subheader(f"Selected Option: {option}")
637
  uploaded_pdf = st.file_uploader("Upload a PDF")
638
  if uploaded_pdf:
639
  temp_file = "./temp.pdf"
 
642
  pdf_filename, = uploaded_pdf.name
643
  submit_upload_pdf = st.form_submit_button(label='Submit')
644
 
645
+ # form for user to configure web scraping for duckduckgo
646
+ if option == options[1]:
647
+ with st.form(key='scrape_web_form'):
648
+ st.subheader(f"Selected Option: {option}")
649
  n_search_results = st.number_input(
650
  "How many DuckDuckGo search results would you like to scrape?",
651
  0, 20,
 
657
  )
658
  submit_scrape_web = st.form_submit_button(label='Submit')
659
 
660
+ if (submit_upload_pdf | submit_scrape_web):
 
661
  if submit_upload_pdf:
662
+ with st.spinner('Generating documents from PDF...'):
663
+ all_documents = pdf_loader_local(pdf_filename, country_scrape_upload)
664
 
665
  if submit_scrape_web:
666
+ with st.spinner('Scraping web using Duck Duck Go search...'):
667
+ all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
668
+ st.write(df_links)
669
+ with st.spinner('Generating documents from web search results...'):
670
+ all_documents = process_links_load_documents(all_links)
671
 
 
 
 
672
 
673
+ with st.spinner('Setting up new bm25 retrievers with documents, can take very long...'):
674
+ # BM25 retriever for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
675
+ # can be used to override the existing retriever for this country in the sidebar document configuration
676
+ setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
677
+
678
+
679
+ with st.spinner('Setting up new chromadb vectores with documents, can take 5 mins and above...'):
680
+ # vectorstore for this country will be stored in "chromadb/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
681
+ # can be used to override existing vectorstore for this country in sidebar document configuration
682
+ setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country_scrape_upload)