Spaces:

bohmian
/

esg_countries_chatbot

Sleeping

App Files Files Community

bohmian committed on Feb 15, 2024

Commit

abd0396

verified ·

1 Parent(s): 72d76ec

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -27

app.py CHANGED Viewed

@@ -286,11 +286,19 @@ def retrieve_answer_for_country(query_and_country: str) -> str: # TODO, change d
             then there is no record for the country and no answer can be obtained."""
         # different retrievers
-        # keyword
-        bm = bm25_retrievers[country]
-        bm.k = st.session_state['bm25_n_similar_documents']
-        # semantic
-        chroma = chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
         # ensemble (below) reranks results from both retrievers above
         ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
         # for user to make selection
@@ -396,17 +404,31 @@ with st.sidebar:
         ],
         icons=['house', 'gear', 'gear', 'gear'],
         menu_icon="", default_index=0)
     with st.expander("Warning", expanded = True):
         st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing.  Wait for query to complete first.")
-    st.container()
-    # see if retrievers created by user's own uploaded PDF or newly scraped data is found
     new_documents_chroma = glob.glob("chromadb/new*")
-    new_documents_bm25 = glob.glob("chromadb/new*")
-    new_countries = [doc.split('_')[1] for doc in new_documents_chroma]
     if len(new_countries) == 0:
         info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
     else:
@@ -605,10 +627,13 @@ if page == "Scrape or Upload Own Docs":
         options=options
     )
-    if option == options[0]:
-        st.subheader(f"Selected Option: {option}")
         with st.form(key='upload_pdf_form'):
             uploaded_pdf = st.file_uploader("Upload a PDF")
             if uploaded_pdf:
                 temp_file = "./temp.pdf"
@@ -617,9 +642,10 @@ if page == "Scrape or Upload Own Docs":
                     pdf_filename, = uploaded_pdf.name
             submit_upload_pdf = st.form_submit_button(label='Submit')
-    if option == options[1]:
-        st.subheader(f"Selected Option: {option}")
-        with st.form(key='upload_pdf_form'):
             n_search_results = st.number_input(
                                         "How many DuckDuckGo search results would you like to scrape?",
                                         0, 20,
@@ -631,19 +657,26 @@ if page == "Scrape or Upload Own Docs":
                                     )
             submit_scrape_web = st.form_submit_button(label='Submit')
-    if submit_upload_pdf | submit_scrape_web:
         if submit_upload_pdf:
-            all_documents = pdf_loader_local(pdf_filename, country)
         if submit_scrape_web:
-            all_links, df_links = duckduckgo_scrape(country, search_term, n_search_results)
-            all_documents = process_links_load_documents(all_links)
-        # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
-        # can be used to override existing vectorstore for this country in sidebar document configuration
-        setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country)
-        # vectorstore for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
-        # can be used to override existing vectorstore for this country in sidebar document configuration
-        setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country)

             then there is no record for the country and no answer can be obtained."""
         # different retrievers
+        if country in st.session_state['countries_override']:
+            # # keyword
+            # bm = new_bm25_retrievers[country]
+            # bm.k = st.session_state['bm25_n_similar_documents']
+            # # semantic
+            # chroma = new_chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
+            pass
+        else:
+            # keyword
+            bm = bm25_retrievers[country]
+            bm.k = st.session_state['bm25_n_similar_documents']
+            # semantic
+            chroma = chroma_db.as_retriever(search_kwargs={'filter': {'country':country}, 'k': st.session_state['chroma_n_similar_documents']})
         # ensemble (below) reranks results from both retrievers above
         ensemble = EnsembleRetriever(retrievers=[bm, chroma], weights=[st.session_state['keyword_retriever_weight'], 1 - st.session_state['keyword_retriever_weight']])
         # for user to make selection
         ],
         icons=['house', 'gear', 'gear', 'gear'],
         menu_icon="", default_index=0)
     with st.expander("Warning", expanded = True):
         st.write("⚠️ DO NOT navigate between pages or change config when chat is ongoing.  Wait for query to complete first.")
+    st.write("")
+    # check whether retrievers/vector stores created from the user's own uploaded PDF or newly scraped data exist
     new_documents_chroma = glob.glob("chromadb/new*")
+    new_documents_bm25 = glob.glob("bm25/new*")
+    new_countries = []
+    # loop through new docs in chroma retrievers created by user scraping/pdf
+    for i, doc in enumerate(new_documents_chroma):
+        if (doc.split('/')[1] == new_documents_bm25[i].split('/')[1]: # check that the doc also exists for bm25 retriever
+            new_doc_country = doc.split('_')[1]
+            new_doc_chunk_size = doc.split('_')[3]
+            new_doc_chunk_overlap = doc.split('_')[5]
+            # check that the retrievers are created for the current selected chunk sizes
+            if ((new_doc_chunk_overlap == st.session_state['chunk_overlap']) & (new_doc_chunk_size == st.session_state['chunk_size'])):
+                new_countries.append(new_doc_country)
+    # if new retrievers that pass the above criteria are found, let the user know their countries
+    # the user can select from these countries to override existing retrievers
+    # otherwise prompt user to scrape or upload own PDF to create the new retrievers
     if len(new_countries) == 0:
         info = '(Own documents not found. Must first scrape or upload own PDF (see menu above) to use this.)'
     else:
         options=options
     )
+    submit_upload_pdf = False
+    submit_scrape_web = False
+    # form for user to configure pdf loading options
+    if option == options[0]:
         with st.form(key='upload_pdf_form'):
+            st.subheader(f"Selected Option: {option}")
             uploaded_pdf = st.file_uploader("Upload a PDF")
             if uploaded_pdf:
                 temp_file = "./temp.pdf"
                     pdf_filename, = uploaded_pdf.name
             submit_upload_pdf = st.form_submit_button(label='Submit')
+    # form for user to configure web scraping for duckduckgo
+    if option == options[1]:
+        with st.form(key='scrape_web_form'):
+            st.subheader(f"Selected Option: {option}")
             n_search_results = st.number_input(
                                         "How many DuckDuckGo search results would you like to scrape?",
                                         0, 20,
                                     )
             submit_scrape_web = st.form_submit_button(label='Submit')
+    if (submit_upload_pdf | submit_scrape_web):
         if submit_upload_pdf:
+            with st.spinner('Generating documents from PDF...'):
+                all_documents = pdf_loader_local(pdf_filename, country_scrape_upload)
         if submit_scrape_web:
+            with st.spinner('Scraping web using Duck Duck Go search...'):
+                all_links, df_links = duckduckgo_scrape(country_scrape_upload, search_term, n_search_results)
+            st.write(df_links)
+            with st.spinner('Generating documents from web search results...'):
+                all_documents = process_links_load_documents(all_links)
+        with st.spinner('Setting up new bm25 retrievers with documents, can take very long...'):
+            # retriever for this country will be stored in "bm25/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
+            # can be used to override the existing retriever for this country in sidebar document configuration
+            setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country_scrape_upload)
+        with st.spinner('Setting up new chromadb vectores with documents, can take 5 mins and above...'):
+            # vectorstore for this country will be stored in "chroma_db/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
+            # can be used to override existing vectorstore for this country in sidebar document configuration
+            setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country_scrape_upload)