Chintan Donda committed on
Commit
8f40cff
1 Parent(s): 2cd07d5

Fixing issues

Browse files
Files changed (4) hide show
  1. app.py +36 -18
  2. src/constants.py +2 -1
  3. src/data_loader.py +17 -2
  4. src/langchain_utils.py +20 -16
app.py CHANGED
@@ -39,8 +39,8 @@ class DomState:
39
 
40
  def click_handler_for_get_relevant_paragraphs(
41
  self,
42
- question,
43
- question_category='general'
44
  ):
45
  self.relevant_paragraphs = self.kkms_kssw_obj.query(
46
  question=question,
@@ -69,19 +69,24 @@ class DomState:
69
 
70
  def click_handler_for_get_answer(
71
  self,
72
- relevant_paragraphs, question
 
73
  ):
74
- self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
 
 
 
75
  return self.answer
76
 
77
 
78
- def click_handler_for_mandi_price(self,
79
- state_name,
80
- apmc_name,
81
- commodity_name,
82
- from_date,
83
- to_date
84
- ):
 
85
  if state_name and apmc_name and commodity_name and from_date and to_date:
86
  self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
87
  return self.mandi_price
@@ -117,12 +122,12 @@ class DomState:
117
  self,
118
  doc_type,
119
  files_or_urls,
120
- index_category='general'
121
  ):
122
  self.kkms_kssw_obj.upload_data(
123
  doc_type=constants_utils.DATA_SOURCES[doc_type],
124
  files_or_urls=files_or_urls,
125
- index_category=index_category
126
  )
127
 
128
 
@@ -262,13 +267,16 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
262
  with gr.Row(visible=True) as rowCustomQuery:
263
  with gr.Column(scale=1, min_width=600):
264
  with gr.Tab(label='Relevant paragraphs'):
 
 
 
265
  question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
266
  # Get the Relevant paragraphs for the question asked
267
  relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
268
  b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
269
  b_relevant_paragraphs.click(
270
  fn=dom.click_handler_for_get_relevant_paragraphs,
271
- inputs=question,
272
  outputs=[relevant_paragraphs]
273
  )
274
 
@@ -396,6 +404,10 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
396
  with gr.Row(visible=False) as rowLoadCustomData:
397
  with gr.Column(scale=1, min_width=600):
398
  with gr.Tab(label='Load Custom Data'):
 
 
 
 
399
  doc_type = gr.Radio(
400
  list(constants_utils.DATA_SOURCES.keys()),
401
  label="Select data source (Supports uploading multiple Files/URLs)",
@@ -414,14 +426,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
414
  b_files = gr.Button("Load PDF Files").style(size='sm')
415
  b_files.click(
416
  fn=dom.click_handler_for_load_files_urls,
417
- inputs=[doc_type, upload_button]
418
  )
419
 
420
  with gr.Row(visible=False) as rowUploadOnlinePdf:
421
  with gr.Column(scale=1, min_width=600):
422
  urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
423
  b_urls = gr.Button("Load Online PDFs").style(size='sm')
424
- b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
 
 
 
425
 
426
  with gr.Row(visible=False) as rowUploadTextFile:
427
  with gr.Column(scale=1, min_width=600):
@@ -435,14 +450,17 @@ with gr.Blocks(title='KKMS-KSSW Demo') as demo:
435
  b_files = gr.Button("Load Text Files").style(size='sm')
436
  b_files.click(
437
  fn=dom.click_handler_for_load_files_urls,
438
- inputs=[doc_type, file_output]
439
  )
440
 
441
  with gr.Row(visible=False) as rowUploadUrls:
442
  with gr.Column(scale=1, min_width=600):
443
  urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
444
  b_urls = gr.Button("Load URLs").style(size='sm')
445
- b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
 
 
 
446
 
447
  doc_type.change(
448
  fn=dom.select_files_urls,
 
39
 
40
  def click_handler_for_get_relevant_paragraphs(
41
  self,
42
+ question_category,
43
+ question
44
  ):
45
  self.relevant_paragraphs = self.kkms_kssw_obj.query(
46
  question=question,
 
69
 
70
  def click_handler_for_get_answer(
71
  self,
72
+ relevant_paragraphs,
73
+ question
74
  ):
75
+ self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(
76
+ relevant_paragraphs,
77
+ question
78
+ )
79
  return self.answer
80
 
81
 
82
+ def click_handler_for_mandi_price(
83
+ self,
84
+ state_name,
85
+ apmc_name,
86
+ commodity_name,
87
+ from_date,
88
+ to_date
89
+ ):
90
  if state_name and apmc_name and commodity_name and from_date and to_date:
91
  self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
92
  return self.mandi_price
 
122
  self,
123
  doc_type,
124
  files_or_urls,
125
+ question_category
126
  ):
127
  self.kkms_kssw_obj.upload_data(
128
  doc_type=constants_utils.DATA_SOURCES[doc_type],
129
  files_or_urls=files_or_urls,
130
+ index_category=question_category
131
  )
132
 
133
 
 
267
  with gr.Row(visible=True) as rowCustomQuery:
268
  with gr.Column(scale=1, min_width=600):
269
  with gr.Tab(label='Relevant paragraphs'):
270
+ question_category = gr.Dropdown(
271
+ constants_utils.INDEX_CATEGORY,
272
+ label="Select Query Type")
273
  question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
274
  # Get the Relevant paragraphs for the question asked
275
  relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
276
  b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
277
  b_relevant_paragraphs.click(
278
  fn=dom.click_handler_for_get_relevant_paragraphs,
279
+ inputs=[question_category, question],
280
  outputs=[relevant_paragraphs]
281
  )
282
 
 
404
  with gr.Row(visible=False) as rowLoadCustomData:
405
  with gr.Column(scale=1, min_width=600):
406
  with gr.Tab(label='Load Custom Data'):
407
+ question_category = gr.Dropdown(
408
+ constants_utils.INDEX_CATEGORY,
409
+ label="Select Query Type")
410
+
411
  doc_type = gr.Radio(
412
  list(constants_utils.DATA_SOURCES.keys()),
413
  label="Select data source (Supports uploading multiple Files/URLs)",
 
426
  b_files = gr.Button("Load PDF Files").style(size='sm')
427
  b_files.click(
428
  fn=dom.click_handler_for_load_files_urls,
429
+ inputs=[doc_type, upload_button, question_category]
430
  )
431
 
432
  with gr.Row(visible=False) as rowUploadOnlinePdf:
433
  with gr.Column(scale=1, min_width=600):
434
  urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
435
  b_urls = gr.Button("Load Online PDFs").style(size='sm')
436
+ b_urls.click(
437
+ fn=dom.click_handler_for_load_files_urls,
438
+ inputs=[doc_type, urls, question_category]
439
+ )
440
 
441
  with gr.Row(visible=False) as rowUploadTextFile:
442
  with gr.Column(scale=1, min_width=600):
 
450
  b_files = gr.Button("Load Text Files").style(size='sm')
451
  b_files.click(
452
  fn=dom.click_handler_for_load_files_urls,
453
+ inputs=[doc_type, file_output, question_category]
454
  )
455
 
456
  with gr.Row(visible=False) as rowUploadUrls:
457
  with gr.Column(scale=1, min_width=600):
458
  urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
459
  b_urls = gr.Button("Load URLs").style(size='sm')
460
+ b_urls.click(
461
+ fn=dom.click_handler_for_load_files_urls,
462
+ inputs=[doc_type, urls, question_category]
463
+ )
464
 
465
  doc_type.change(
466
  fn=dom.select_files_urls,
src/constants.py CHANGED
@@ -22,7 +22,7 @@ INDEX_CATEGORY = [
22
  # 'insurance',
23
  # 'soil',
24
  'general',
25
- 'vegetables',
26
  ]
27
 
28
  # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
@@ -43,6 +43,7 @@ DATA_SOURCES = {
43
  # LangChain related constants
44
  TEXT_SPLITTER_CHUNK_SIZE = 1000
45
  TEXT_SPLITTER_CHUNK_OVERLAP = 0
 
46
 
47
 
48
  URLS = [
 
22
  # 'insurance',
23
  # 'soil',
24
  'general',
25
+ 'vegetables'
26
  ]
27
 
28
  # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
 
43
  # LangChain related constants
44
  TEXT_SPLITTER_CHUNK_SIZE = 1000
45
  TEXT_SPLITTER_CHUNK_OVERLAP = 0
46
+ TEXT_SPLITTER_SEPARATOR = '\n\n'
47
 
48
 
49
  URLS = [
src/data_loader.py CHANGED
@@ -135,7 +135,17 @@ class DATA_LOADER:
135
 
136
  # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
  elif doc_type == 'directory':
138
- documents = SimpleDirectoryReader(doc_filepath).load_data()
 
 
 
 
 
 
 
 
 
 
139
 
140
  # Load data from URLs in Knowledge Base format
141
  elif doc_type == 'url-kb':
@@ -190,7 +200,12 @@ class DATA_LOADER:
190
  ):
191
  cleaned_documents = []
192
  for document in documents:
193
- document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
 
 
 
 
 
194
  cleaned_documents.append(document)
195
  return cleaned_documents
196
 
 
135
 
136
  # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
  elif doc_type == 'directory':
138
+ # Load multiple PDFs from directory
139
+ if os.path.isdir(doc_filepath):
140
+ documents = SimpleDirectoryReader(
141
+ input_dir=doc_filepath
142
+ ).load_data()
143
+
144
+ # Loading from a file
145
+ elif os.path.isfile(doc_filepath):
146
+ documents.extend(SimpleDirectoryReader(
147
+ input_files=[doc_filepath]
148
+ ).load_data())
149
 
150
  # Load data from URLs in Knowledge Base format
151
  elif doc_type == 'url-kb':
 
200
  ):
201
  cleaned_documents = []
202
  for document in documents:
203
+ if hasattr(document, 'page_content'):
204
+ document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
205
+ elif hasattr(document, 'text'):
206
+ document.text = self.utils_obj.replace_newlines_and_spaces(document.text)
207
+ else:
208
+ document = self.utils_obj.replace_newlines_and_spaces(document)
209
  cleaned_documents.append(document)
210
  return cleaned_documents
211
 
src/langchain_utils.py CHANGED
@@ -22,7 +22,6 @@ from typing import Dict, List, Optional
22
 
23
  import os
24
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
25
- os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
26
 
27
  import logging
28
  logger = logging.getLogger(__name__)
@@ -186,7 +185,11 @@ class LANGCHAIN_UTILS:
186
  custom_prompt=True
187
  ):
188
  # Prepare data (Split paragraph into chunks of small documents)
189
- text_splitter = CharacterTextSplitter(chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE, chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP)
 
 
 
 
190
  texts = text_splitter.split_text(para)
191
 
192
  if self.index_type == 'FAISS':
@@ -299,15 +302,16 @@ class LANGCHAIN_UTILS:
299
 
300
  logger.info(f'Creating index')
301
 
 
 
 
 
 
 
 
302
  ############## Build the Vector store for docs ##############
303
  # Vector store using Facebook AI Similarity Search
304
  if self.index_type == 'FAISS':
305
- text_splitter = CharacterTextSplitter(
306
- chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
307
- chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
308
- )
309
- self.documents = text_splitter.split_documents(self.documents)
310
-
311
  self.index = FAISS.from_documents(
312
  self.documents,
313
  self.embeddings
@@ -318,11 +322,6 @@ class LANGCHAIN_UTILS:
318
  if not os.path.exists(self.index_filepath):
319
  os.makedirs(self.index_filepath)
320
 
321
- text_splitter = CharacterTextSplitter(
322
- chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
323
- chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
324
- )
325
- self.documents = text_splitter.split_documents(self.documents)
326
  self.index = Chroma.from_documents(
327
  self.documents,
328
  self.embeddings,
@@ -478,7 +477,7 @@ class LANGCHAIN_UTILS:
478
 
479
  logger.info(f'Saving index to: {index_filepath}')
480
 
481
- if not os.path.exists(index_filepath):
482
  os.makedirs(index_filepath)
483
 
484
  if self.index_type == 'FAISS':
@@ -598,6 +597,7 @@ class LANGCHAIN_UTILS:
598
  if not index or not isinstance(index, GPTSimpleVectorIndex):
599
  logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
600
  continue
 
601
  raise NotImplementedError
602
 
603
  # Store index_category master index
@@ -634,7 +634,11 @@ class LANGCHAIN_UTILS:
634
  logger.info('Chroma DB initialized successfully!')
635
 
636
 
637
- def query_chromadb(self, question, k=1):
 
 
 
 
638
  return self.index.similarity_search(query=question, k=k)
639
 
640
 
@@ -658,7 +662,7 @@ class LANGCHAIN_UTILS:
658
  response = None
659
 
660
  # Get the index of the given question_category
661
- index = self.index_category_doc_type_wise_index[question_category]['master']
662
 
663
  if self.index_type == 'FAISS':
664
  response = index.similarity_search(
 
22
 
23
  import os
24
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
 
25
 
26
  import logging
27
  logger = logging.getLogger(__name__)
 
185
  custom_prompt=True
186
  ):
187
  # Prepare data (Split paragraph into chunks of small documents)
188
+ text_splitter = CharacterTextSplitter(
189
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
190
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
191
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
192
+ )
193
  texts = text_splitter.split_text(para)
194
 
195
  if self.index_type == 'FAISS':
 
302
 
303
  logger.info(f'Creating index')
304
 
305
+ text_splitter = CharacterTextSplitter(
306
+ chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
307
+ chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
308
+ separator=constants_utils.TEXT_SPLITTER_SEPARATOR
309
+ )
310
+ self.documents = text_splitter.split_documents(self.documents)
311
+
312
  ############## Build the Vector store for docs ##############
313
  # Vector store using Facebook AI Similarity Search
314
  if self.index_type == 'FAISS':
 
 
 
 
 
 
315
  self.index = FAISS.from_documents(
316
  self.documents,
317
  self.embeddings
 
322
  if not os.path.exists(self.index_filepath):
323
  os.makedirs(self.index_filepath)
324
 
 
 
 
 
 
325
  self.index = Chroma.from_documents(
326
  self.documents,
327
  self.embeddings,
 
477
 
478
  logger.info(f'Saving index to: {index_filepath}')
479
 
480
+ if not os.path.exists(index_filepath) and os.path.isdir(index_filepath):
481
  os.makedirs(index_filepath)
482
 
483
  if self.index_type == 'FAISS':
 
597
  if not index or not isinstance(index, GPTSimpleVectorIndex):
598
  logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
599
  continue
600
+ import pdb; pdb.set_trace()
601
  raise NotImplementedError
602
 
603
  # Store index_category master index
 
634
  logger.info('Chroma DB initialized successfully!')
635
 
636
 
637
+ def query_chromadb(
638
+ self,
639
+ question,
640
+ k=1
641
+ ):
642
  return self.index.similarity_search(query=question, k=k)
643
 
644
 
 
662
  response = None
663
 
664
  # Get the index of the given question_category
665
+ index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]
666
 
667
  if self.index_type == 'FAISS':
668
  response = index.similarity_search(