Chintan Donda committed on
Commit
8f40cff
1 Parent(s): 2cd07d5

Fixing issues

Browse files
Files changed (4) hide show
  1. app.py +36 -18
  2. src/constants.py +2 -1
  3. src/data_loader.py +17 -2
  4. src/langchain_utils.py +20 -16
app.py CHANGED
@@ -39,8 +39,8 @@ class DomState:
39
 
40
  def click_handler_for_get_relevant_paragraphs(
41
  self,
42
- question,
43
- question_category='general'
44
  ):
45
  self.relevant_paragraphs = self.kkms_kssw_obj.query(
46
  question=question,
@@ -69,19 +69,24 @@ class DomState:
69
 
70
  def click_handler_for_get_answer(
71
  self,
72
- relevant_paragraphs, question
 
73
  ):
74
- self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
 
 
 
75
  return self.answer
76
 
77
 
78
- def click_handler_for_mandi_price(self,
79
- state_name,
80
- apmc_name,
81
- commodity_name,
82
- from_date,
83
- to_date
84
- ):
 
85
  if state_name and apmc_name and commodity_name and from_date and to_date:
86
  self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
87
  return self.mandi_price
@@ -117,12 +122,12 @@ class DomState:
117
  self,
118
  doc_type,
119
  files_or_urls,
120
- index_category='general'
121
  ):
122
  self.kkms_kssw_obj.upload_data(
123
  doc_type=constants_utils.DATA_SOURCES[doc_type],
124
  files_or_urls=files_or_urls,
125
- index_category=index_category
126
  )
127
 
128
 
@@ -262,13 +267,16 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
262
  with gr.Row(visible=True) as rowCustomQuery:
263
  with gr.Column(scale=1, min_width=600):
264
  with gr.Tab(label='Relevant paragraphs'):
 
 
 
265
  question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
266
  # Get the Relevant paragraphs for the question asked
267
  relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
268
  b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
269
  b_relevant_paragraphs.click(
270
  fn=dom.click_handler_for_get_relevant_paragraphs,
271
- inputs=question,
272
  outputs=[relevant_paragraphs]
273
  )
274
 
@@ -396,6 +404,10 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
396
  with gr.Row(visible=False) as rowLoadCustomData:
397
  with gr.Column(scale=1, min_width=600):
398
  with gr.Tab(label='Load Custom Data'):
 
 
 
 
399
  doc_type = gr.Radio(
400
  list(constants_utils.DATA_SOURCES.keys()),
401
  label="Select data source (Supports uploading multiple Files/URLs)",
@@ -414,14 +426,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
414
  b_files = gr.Button("Load PDF Files").style(size='sm')
415
  b_files.click(
416
  fn=dom.click_handler_for_load_files_urls,
417
- inputs=[doc_type, upload_button]
418
  )
419
 
420
  with gr.Row(visible=False) as rowUploadOnlinePdf:
421
  with gr.Column(scale=1, min_width=600):
422
  urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
423
  b_urls = gr.Button("Load Online PDFs").style(size='sm')
424
- b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
 
 
 
425
 
426
  with gr.Row(visible=False) as rowUploadTextFile:
427
  with gr.Column(scale=1, min_width=600):
@@ -435,14 +450,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
435
  b_files = gr.Button("Load Text Files").style(size='sm')
436
  b_files.click(
437
  fn=dom.click_handler_for_load_files_urls,
438
- inputs=[doc_type, file_output]
439
  )
440
 
441
  with gr.Row(visible=False) as rowUploadUrls:
442
  with gr.Column(scale=1, min_width=600):
443
  urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
444
  b_urls = gr.Button("Load URLs").style(size='sm')
445
- b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
 
 
 
446
 
447
  doc_type.change(
448
  fn=dom.select_files_urls,
 
39
 
40
  def click_handler_for_get_relevant_paragraphs(
41
  self,
42
+ question_category,
43
+ question
44
  ):
45
  self.relevant_paragraphs = self.kkms_kssw_obj.query(
46
  question=question,
 
69
 
70
  def click_handler_for_get_answer(
71
  self,
72
+ relevant_paragraphs,
73
+ question
74
  ):
75
+ self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(
76
+ relevant_paragraphs,
77
+ question
78
+ )
79
  return self.answer
80
 
81
 
82
+ def click_handler_for_mandi_price(
83
+ self,
84
+ state_name,
85
+ apmc_name,
86
+ commodity_name,
87
+ from_date,
88
+ to_date
89
+ ):
90
  if state_name and apmc_name and commodity_name and from_date and to_date:
91
  self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
92
  return self.mandi_price
 
122
  self,
123
  doc_type,
124
  files_or_urls,
125
+ question_category
126
  ):
127
  self.kkms_kssw_obj.upload_data(
128
  doc_type=constants_utils.DATA_SOURCES[doc_type],
129
  files_or_urls=files_or_urls,
130
+ index_category=question_category
131
  )
132
 
133
 
 
267
  with gr.Row(visible=True) as rowCustomQuery:
268
  with gr.Column(scale=1, min_width=600):
269
  with gr.Tab(label='Relevant paragraphs'):
270
+ question_category = gr.Dropdown(
271
+ constants_utils.INDEX_CATEGORY,
272
+ label="Select Query Type")
273
  question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
274
  # Get the Relevant paragraphs for the question asked
275
  relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
276
  b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
277
  b_relevant_paragraphs.click(
278
  fn=dom.click_handler_for_get_relevant_paragraphs,
279
+ inputs=[question_category, question],
280
  outputs=[relevant_paragraphs]
281
  )
282
 
 
404
  with gr.Row(visible=False) as rowLoadCustomData:
405
  with gr.Column(scale=1, min_width=600):
406
  with gr.Tab(label='Load Custom Data'):
407
+ question_category = gr.Dropdown(
408
+ constants_utils.INDEX_CATEGORY,
409
+ label="Select Query Type")
410
+
411
  doc_type = gr.Radio(
412
  list(constants_utils.DATA_SOURCES.keys()),
413
  label="Select data source (Supports uploading multiple Files/URLs)",
 
426
  b_files = gr.Button("Load PDF Files").style(size='sm')
427
  b_files.click(
428
  fn=dom.click_handler_for_load_files_urls,
429
+ inputs=[doc_type, upload_button, question_category]
430
  )
431
 
432
  with gr.Row(visible=False) as rowUploadOnlinePdf:
433
  with gr.Column(scale=1, min_width=600):
434
  urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
435
  b_urls = gr.Button("Load Online PDFs").style(size='sm')
436
+ b_urls.click(
437
+ fn=dom.click_handler_for_load_files_urls,
438
+ inputs=[doc_type, urls, question_category]
439
+ )
440
 
441
  with gr.Row(visible=False) as rowUploadTextFile:
442
  with gr.Column(scale=1, min_width=600):
 
450
  b_files = gr.Button("Load Text Files").style(size='sm')
451
  b_files.click(
452
  fn=dom.click_handler_for_load_files_urls,
453
+ inputs=[doc_type, file_output, question_category]
454
  )
455
 
456
  with gr.Row(visible=False) as rowUploadUrls:
457
  with gr.Column(scale=1, min_width=600):
458
  urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
459
  b_urls = gr.Button("Load URLs").style(size='sm')
460
+ b_urls.click(
461
+ fn=dom.click_handler_for_load_files_urls,
462
+ inputs=[doc_type, urls, question_category]
463
+ )
464
 
465
  doc_type.change(
466
  fn=dom.select_files_urls,
src/constants.py CHANGED
@@ -22,7 +22,7 @@ INDEX_CATEGORY = [
22
  # 'insurance',
23
  # 'soil',
24
  'general',
25
- 'vegetables',
26
  ]
27
 
28
  # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
@@ -43,6 +43,7 @@ DATA_SOURCES = {
43
  # LangChain related constants
44
  TEXT_SPLITTER_CHUNK_SIZE = 1000
45
  TEXT_SPLITTER_CHUNK_OVERLAP = 0
 
46
 
47
 
48
  URLS = [
 
22
  # 'insurance',
23
  # 'soil',
24
  'general',
25
+ 'vegetables'
26
  ]
27
 
28
  # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
 
43
  # LangChain related constants
44
  TEXT_SPLITTER_CHUNK_SIZE = 1000
45
  TEXT_SPLITTER_CHUNK_OVERLAP = 0
46
+ TEXT_SPLITTER_SEPARATOR = '\n\n'
47
 
48
 
49
  URLS = [
src/data_loader.py CHANGED
@@ -135,7 +135,17 @@ class DATA_LOADER:
135
 
136
  # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
  elif doc_type == 'directory':
138
- documents = SimpleDirectoryReader(doc_filepath).load_data()
 
 
 
 
 
 
 
 
 
 
139
 
140
  # Load data from URLs in Knowledge Base format
141
  elif doc_type == 'url-kb':
@@ -190,7 +200,12 @@ class DATA_LOADER:
190
  ):
191
  cleaned_documents = []
192
  for document in documents:
193
- document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
 
 
 
 
 
194
  cleaned_documents.append(document)
195
  return cleaned_documents
196
 
 
135
 
136
  # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
  elif doc_type == 'directory':
138
+ # Load multiple PDFs from directory
139
+ if os.path.isdir(doc_filepath):
140
+ documents = SimpleDirectoryReader(
141
+ input_dir=doc_filepath
142
+ ).load_data()
143
+
144
+ # Loading from a file
145
+ elif os.path.isfile(doc_filepath):
146
+ documents.extend(SimpleDirectoryReader(
147
+ input_files=[doc_filepath]
148
+ ).load_data())
149
 
150
  # Load data from URLs in Knowledge Base format
151
  elif doc_type == 'url-kb':
 
200
  ):
201
  cleaned_documents = []
202
  for document in documents:
203
+ if hasattr(document, 'page_content'):
204
+ document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
205
+ elif hasattr(document, 'text'):
206
+ document.text = self.utils_obj.replace_newlines_and_spaces(document.text)
207
+ else:
208
+ document = self.utils_obj.replace_newlines_and_spaces(document)
209
  cleaned_documents.append(document)
210
  return cleaned_documents
211
 
src/langchain_utils.py CHANGED
@@ -22,7 +22,6 @@ from typing import Dict, List, Optional
22
 
23
  import os
24
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
25
- os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
26
 
27
  import logging
28
  logger = logging.getLogger(__name__)
@@ -186,7 +185,11 @@ class LANGCHAIN_UTILS:
186
  custom_prompt=True
187
  ):
188
  # Prepare data (Split paragraph into chunks of small documents)
189
- text_splitter = CharacterTextSplitter(chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE, chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP)
 
 
 
 
190
  texts = text_splitter.split_text(para)
191
 
192
  if self.index_type == 'FAISS':
@@ -299,15 +302,16 @@ class LANGCHAIN_UTILS:
299
 
300
  logger.info(f'Creating index')
301
 
 
 
 
 
 
 
 
302
  ############## Build the Vector store for docs ##############
303
  # Vector store using Facebook AI Similarity Search
304
  if self.index_type == 'FAISS':
305
- text_splitter = CharacterTextSplitter(
306
- chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
307
- chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
308
- )
309
- self.documents = text_splitter.split_documents(self.documents)
310
-
311
  self.index = FAISS.from_documents(
312
  self.documents,
313
  self.embeddings
@@ -318,11 +322,6 @@ class LANGCHAIN_UTILS:
318
  if not os.path.exists(self.index_filepath):
319
  os.makedirs(self.index_filepath)
320
 
321
- text_splitter = CharacterTextSplitter(
322
- chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
323
- chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
324
- )
325
- self.documents = text_splitter.split_documents(self.documents)
326
  self.index = Chroma.from_documents(
327
  self.documents,
328
  self.embeddings,
@@ -478,7 +477,7 @@ class LANGCHAIN_UTILS:
478
 
479
  logger.info(f'Saving index to: {index_filepath}')
480
 
481
- if not os.path.exists(index_filepath):
482
  os.makedirs(index_filepath)
483
 
484
  if self.index_type == 'FAISS':
@@ -598,6 +597,7 @@ class LANGCHAIN_UTILS:
598
  if not index or not isinstance(index, GPTSimpleVectorIndex):
599
  logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
600
  continue
 
601
  raise NotImplementedError
602
 
603
  # Store index_category master index
@@ -634,7 +634,11 @@ class LANGCHAIN_UTILS:
634
  logger.info('Chroma DB initialized successfully!')
635
 
636
 
637
- def query_chromadb(self, question, k=1):
 
 
 
 
638
  return self.index.similarity_search(query=question, k=k)
639
 
640
 
@@ -658,7 +662,7 @@ class LANGCHAIN_UTILS:
658
  response = None
659
 
660
  # Get the index of the given question_category
661
- index = self.index_category_doc_type_wise_index[question_category]['master']
662
 
663
  if self.index_type == 'FAISS':
664
  response = index.similarity_search(
 
22
 
23
  import os
24
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
 
25
 
26
  import logging
27
  logger = logging.getLogger(__name__)
 
185
  custom_prompt=True
186
  ):
187
  # Prepare data (Split paragraph into chunks of small documents)
188
+ text_splitter = CharacterTextSplitter(
189
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
190
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
191
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
192
+ )
193
  texts = text_splitter.split_text(para)
194
 
195
  if self.index_type == 'FAISS':
 
302
 
303
  logger.info(f'Creating index')
304
 
305
+ text_splitter = CharacterTextSplitter(
306
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
307
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
308
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
309
+ )
310
+ self.documents = text_splitter.split_documents(self.documents)
311
+
312
  ############## Build the Vector store for docs ##############
313
  # Vector store using Facebook AI Similarity Search
314
  if self.index_type == 'FAISS':
 
 
 
 
 
 
315
  self.index = FAISS.from_documents(
316
  self.documents,
317
  self.embeddings
 
322
  if not os.path.exists(self.index_filepath):
323
  os.makedirs(self.index_filepath)
324
 
 
 
 
 
 
325
  self.index = Chroma.from_documents(
326
  self.documents,
327
  self.embeddings,
 
477
 
478
  logger.info(f'Saving index to: {index_filepath}')
479
 
480
+ if not os.path.exists(index_filepath) and os.path.isdir(index_filepath):
481
  os.makedirs(index_filepath)
482
 
483
  if self.index_type == 'FAISS':
 
597
  if not index or not isinstance(index, GPTSimpleVectorIndex):
598
  logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
599
  continue
600
+ import pdb; pdb.set_trace()
601
  raise NotImplementedError
602
 
603
  # Store index_category master index
 
634
  logger.info('Chroma DB initialized successfully!')
635
 
636
 
637
+ def query_chromadb(
638
+ self,
639
+ question,
640
+ k=1
641
+ ):
642
  return self.index.similarity_search(query=question, k=k)
643
 
644
 
 
662
  response = None
663
 
664
  # Get the index of the given question_category
665
+ index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
666
 
667
  if self.index_type == 'FAISS':
668
  response = index.similarity_search(