prashant committed on
Commit
d7ce857
·
1 Parent(s): 7af394d
appStore/keyword_search.py CHANGED
@@ -6,7 +6,7 @@ import streamlit as st
6
  import json
7
  import logging
8
  from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
9
- from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_search
10
  from utils.checkconfig import getconfig
11
 
12
  # Declare all the necessary variables
@@ -21,6 +21,7 @@ embedding_model = config.get('semantic_search','RETRIEVER')
21
  embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
22
  embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
23
  embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
 
24
  retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
25
  reader_model = config.get('semantic_search','READER')
26
  reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
@@ -100,7 +101,7 @@ def app():
100
  if 'filepath' in st.session_state:
101
 
102
  if searchtype:
103
- allDocuments = runLexicalPreprocessingPipeline(
104
  file_name=st.session_state['filename'],
105
  file_path=st.session_state['filepath'],
106
  split_by=lexical_split_by,
@@ -110,13 +111,12 @@ def app():
110
  logging.info("performing lexical search")
111
  with st.spinner("Performing Exact matching search \
112
  (Lexical search) for you"):
113
- st.markdown("##### Top few lexical search (TFIDF) hits #####")
114
  lexical_search(
115
  query=queryList,
116
- documents = allDocuments['documents'],
117
  top_k = lexical_top_k )
118
  else:
119
- allDocuments = runSemanticPreprocessingPipeline(
120
  file_path= st.session_state['filepath'],
121
  file_name = st.session_state['filename'],
122
  split_by=split_by,
@@ -124,20 +124,21 @@ def app():
124
  split_overlap=split_overlap,
125
  removePunc= remove_punc,
126
  split_respect_sentence_boundary=split_respect_sentence_boundary)
127
- if len(allDocuments['documents']) > 100:
128
  warning_msg = ": This might take sometime, please sit back and relax."
129
  else:
130
  warning_msg = ""
131
 
132
  logging.info("starting semantic search")
133
  with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
134
- semantic_search(query = queryList,
135
- documents = allDocuments['documents'],
136
  embedding_model=embedding_model,
137
  embedding_layer=embedding_layer,
138
  embedding_model_format=embedding_model_format,
139
  reader_model=reader_model,reader_top_k=reader_top_k,
140
- retriever_top_k=retriever_top_k, embedding_dim=embedding_dim)
 
141
 
142
  else:
143
  st.info("🤔 No document found, please try to upload it at the sidebar!")
 
6
  import json
7
  import logging
8
  from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
9
+ from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
10
  from utils.checkconfig import getconfig
11
 
12
  # Declare all the necessary variables
 
21
  embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
22
  embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
23
  embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
24
+ max_seq_len = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
25
  retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
26
  reader_model = config.get('semantic_search','READER')
27
  reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 
101
  if 'filepath' in st.session_state:
102
 
103
  if searchtype:
104
+ all_documents = runLexicalPreprocessingPipeline(
105
  file_name=st.session_state['filename'],
106
  file_path=st.session_state['filepath'],
107
  split_by=lexical_split_by,
 
111
  logging.info("performing lexical search")
112
  with st.spinner("Performing Exact matching search \
113
  (Lexical search) for you"):
 
114
  lexical_search(
115
  query=queryList,
116
+ documents = all_documents['documents'],
117
  top_k = lexical_top_k )
118
  else:
119
+ all_documents = runSemanticPreprocessingPipeline(
120
  file_path= st.session_state['filepath'],
121
  file_name = st.session_state['filename'],
122
  split_by=split_by,
 
124
  split_overlap=split_overlap,
125
  removePunc= remove_punc,
126
  split_respect_sentence_boundary=split_respect_sentence_boundary)
127
+ if len(all_documents['documents']) > 100:
128
  warning_msg = ": This might take sometime, please sit back and relax."
129
  else:
130
  warning_msg = ""
131
 
132
  logging.info("starting semantic search")
133
  with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
134
+ semantic_keywordsearch(query = queryList,
135
+ documents = all_documents['documents'],
136
  embedding_model=embedding_model,
137
  embedding_layer=embedding_layer,
138
  embedding_model_format=embedding_model_format,
139
  reader_model=reader_model,reader_top_k=reader_top_k,
140
+ retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
141
+ max_seq_len=max_seq_len)
142
 
143
  else:
144
  st.info("🤔 No document found, please try to upload it at the sidebar!")
appStore/sdg_analysis.py CHANGED
@@ -93,31 +93,31 @@ def app():
93
  file_path = st.session_state['filepath']
94
  classifier = load_sdgClassifier(classifier_name=model_name)
95
  st.session_state['sdg_classifier'] = classifier
96
- allDocuments = runSDGPreprocessingPipeline(fileName= file_name,
97
  filePath= file_path, split_by= split_by,
98
  split_length= split_length,
99
  split_overlap= split_overlap,
100
  split_respect_sentence_boundary= split_respect_sentence_boundary,
101
- removePunc= remove_punc)
102
 
103
- if len(allDocuments['documents']) > 100:
104
  warning_msg = ": This might take sometime, please sit back and relax."
105
  else:
106
  warning_msg = ""
107
 
108
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
109
 
110
- df, x = sdg_classification(haystackdoc=allDocuments['documents'],
111
  threshold= threshold)
112
  df = df.drop(['Relevancy'], axis = 1)
113
  sdg_labels = x.SDG.unique()[::-1]
114
- textrankkeywordlist = []
115
  for label in sdg_labels:
116
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
117
  textranklist_ = textrank(textdata=sdgdata, words= top_n)
118
  if len(textranklist_) > 0:
119
- textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
120
- tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
121
 
122
 
123
  plt.rcParams['font.size'] = 25
 
93
  file_path = st.session_state['filepath']
94
  classifier = load_sdgClassifier(classifier_name=model_name)
95
  st.session_state['sdg_classifier'] = classifier
96
+ all_documents = runSDGPreprocessingPipeline(fileName= file_name,
97
  filePath= file_path, split_by= split_by,
98
  split_length= split_length,
99
  split_overlap= split_overlap,
100
  split_respect_sentence_boundary= split_respect_sentence_boundary,
101
+ remove_punc= remove_punc)
102
 
103
+ if len(all_documents['documents']) > 100:
104
  warning_msg = ": This might take sometime, please sit back and relax."
105
  else:
106
  warning_msg = ""
107
 
108
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
109
 
110
+ df, x = sdg_classification(haystack_doc=all_documents['documents'],
111
  threshold= threshold)
112
  df = df.drop(['Relevancy'], axis = 1)
113
  sdg_labels = x.SDG.unique()[::-1]
114
+ textrank_keyword_list = []
115
  for label in sdg_labels:
116
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
117
  textranklist_ = textrank(textdata=sdgdata, words= top_n)
118
  if len(textranklist_) > 0:
119
+ textrank_keyword_list.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
120
+ tRkeywordsDf = pd.DataFrame(textrank_keyword_list)
121
 
122
 
123
  plt.rcParams['font.size'] = 25
utils/keyword_extraction.py CHANGED
@@ -58,7 +58,7 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
58
  return results
59
 
60
 
61
- def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
62
  """
63
  TFIDF based keywords extraction
64
 
@@ -81,7 +81,7 @@ def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
81
  keywords = [keyword for keyword in results]
82
  return keywords
83
 
84
- def keywordExtraction(sdg:int,sdgdata:List[Text]):
85
  """
86
  TFIDF based keywords extraction
87
 
@@ -102,7 +102,7 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
102
  features = vectorizer.get_feature_names_out()
103
  tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
104
  sorted_items=sort_coo(tf_idf_vector.tocoo())
105
- top_n = int(config.get('tfidf', 'TOP_N'))
106
  results=extract_topn_from_vector(features,sorted_items,top_n)
107
  keywords = [keyword for keyword in results]
108
  return keywords
 
58
  return results
59
 
60
 
61
+ def tfidf_keyword(textdata, vectorizer, tfidfmodel, top_n):
62
  """
63
  TFIDF based keywords extraction
64
 
 
81
  keywords = [keyword for keyword in results]
82
  return keywords
83
 
84
+ def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
85
  """
86
  TFIDF based keywords extraction
87
 
 
102
  features = vectorizer.get_feature_names_out()
103
  tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
104
  sorted_items=sort_coo(tf_idf_vector.tocoo())
105
+ top_n = top_n
106
  results=extract_topn_from_vector(features,sorted_items,top_n)
107
  keywords = [keyword for keyword in results]
108
  return keywords
utils/lexical_search.py CHANGED
@@ -25,7 +25,7 @@ except ImportError:
25
 
26
  def runLexicalPreprocessingPipeline(file_path,file_name,
27
  split_by: Literal["sentence", "word"] = 'word',
28
- split_length:int = 80, removePunc:bool = False,
29
  split_overlap:int = 0 )->List[Document]:
30
  """
31
  creates the pipeline and runs the preprocessing pipeline,
@@ -61,7 +61,7 @@ def runLexicalPreprocessingPipeline(file_path,file_name,
61
  output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
62
  params= {"FileConverter": {"file_path": file_path, \
63
  "file_name": file_name},
64
- "UdfPreProcessor": {"removePunc": removePunc, \
65
  "split_by": split_by, \
66
  "split_length":split_length,\
67
  "split_overlap": split_overlap}})
@@ -223,12 +223,23 @@ def lexical_search(query:Text,top_k:int, documents:List[Document]):
223
  retriever = TfidfRetriever(document_store)
224
  results = retriever.retrieve(query=query, top_k = top_k)
225
  query_tokens = tokenize_lexical_query(query)
 
226
  for count, result in enumerate(results):
227
  matches, doc = runSpacyMatcher(query_tokens,result.content)
 
228
  if len(matches) != 0:
 
 
 
 
 
 
 
229
  if check_streamlit():
230
  st.write("Result {}".format(count+1))
231
  else:
232
  print("Results {}".format(count +1))
233
  spacyAnnotator(matches, doc)
234
-
 
 
 
25
 
26
  def runLexicalPreprocessingPipeline(file_path,file_name,
27
  split_by: Literal["sentence", "word"] = 'word',
28
+ split_length:int = 80, remove_punc:bool = False,
29
  split_overlap:int = 0 )->List[Document]:
30
  """
31
  creates the pipeline and runs the preprocessing pipeline,
 
61
  output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
62
  params= {"FileConverter": {"file_path": file_path, \
63
  "file_name": file_name},
64
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
65
  "split_by": split_by, \
66
  "split_length":split_length,\
67
  "split_overlap": split_overlap}})
 
223
  retriever = TfidfRetriever(document_store)
224
  results = retriever.retrieve(query=query, top_k = top_k)
225
  query_tokens = tokenize_lexical_query(query)
226
+ flag = True
227
  for count, result in enumerate(results):
228
  matches, doc = runSpacyMatcher(query_tokens,result.content)
229
+
230
  if len(matches) != 0:
231
+ if flag:
232
+ flag = False
233
+ if check_streamlit:
234
+ st.markdown("##### Top few lexical search (TFIDF) hits #####")
235
+ else:
236
+ print("Top few lexical search (TFIDF) hits")
237
+
238
  if check_streamlit():
239
  st.write("Result {}".format(count+1))
240
  else:
241
  print("Results {}".format(count +1))
242
  spacyAnnotator(matches, doc)
243
+
244
+ if flag:
245
+ st.info("🤔 No relevant result found. Please try another keyword.")
utils/preprocessing.py CHANGED
@@ -120,7 +120,7 @@ class FileConverter(BaseComponent):
120
  return
121
 
122
 
123
- def basic(s, removePunc:bool = False):
124
 
125
  """
126
  Performs basic cleaning of text.
@@ -141,7 +141,7 @@ def basic(s, removePunc:bool = False):
141
  s = re.sub('\n', ' ', s)
142
 
143
  # Remove punctuations
144
- if removePunc == True:
145
  translator = str.maketrans(' ', ' ', string.punctuation)
146
  s = s.translate(translator)
147
  # Remove distracting single quotes and dotted pattern
@@ -164,7 +164,7 @@ class UdfPreProcessor(BaseComponent):
164
  """
165
  outgoing_edges = 1
166
 
167
- def run(self, documents:List[Document], removePunc:bool,
168
  split_by: Literal["sentence", "word"] = 'sentence',
169
  split_respect_sentence_boundary = False,
170
  split_length:int = 2, split_overlap:int = 0):
@@ -220,7 +220,7 @@ class UdfPreProcessor(BaseComponent):
220
  # i = basic(i)
221
  docs_processed = preprocessor.process([i])
222
  for item in docs_processed:
223
- item.content = basic(item.content, removePunc= removePunc)
224
 
225
  df = pd.DataFrame(docs_processed)
226
  all_text = " ".join(df.content.to_list())
@@ -248,12 +248,12 @@ def processingpipeline():
248
  """
249
 
250
  preprocessing_pipeline = Pipeline()
251
- fileconverter = FileConverter()
252
- customPreprocessor = UdfPreProcessor()
253
 
254
- preprocessing_pipeline.add_node(component=fileconverter,
255
  name="FileConverter", inputs=["File"])
256
- preprocessing_pipeline.add_node(component = customPreprocessor,
257
  name ='UdfPreProcessor', inputs=["FileConverter"])
258
 
259
  return preprocessing_pipeline
 
120
  return
121
 
122
 
123
+ def basic(s, remove_punc:bool = False):
124
 
125
  """
126
  Performs basic cleaning of text.
 
141
  s = re.sub('\n', ' ', s)
142
 
143
  # Remove punctuations
144
+ if remove_punc == True:
145
  translator = str.maketrans(' ', ' ', string.punctuation)
146
  s = s.translate(translator)
147
  # Remove distracting single quotes and dotted pattern
 
164
  """
165
  outgoing_edges = 1
166
 
167
+ def run(self, documents:List[Document], remove_punc:bool,
168
  split_by: Literal["sentence", "word"] = 'sentence',
169
  split_respect_sentence_boundary = False,
170
  split_length:int = 2, split_overlap:int = 0):
 
220
  # i = basic(i)
221
  docs_processed = preprocessor.process([i])
222
  for item in docs_processed:
223
+ item.content = basic(item.content, remove_punc= remove_punc)
224
 
225
  df = pd.DataFrame(docs_processed)
226
  all_text = " ".join(df.content.to_list())
 
248
  """
249
 
250
  preprocessing_pipeline = Pipeline()
251
+ file_converter = FileConverter()
252
+ custom_preprocessor = UdfPreProcessor()
253
 
254
+ preprocessing_pipeline.add_node(component=file_converter,
255
  name="FileConverter", inputs=["File"])
256
+ preprocessing_pipeline.add_node(component = custom_preprocessor,
257
  name ='UdfPreProcessor', inputs=["FileConverter"])
258
 
259
  return preprocessing_pipeline
utils/sdg_classifier.py CHANGED
@@ -34,7 +34,7 @@ _lab_dict = {0: 'no_cat',
34
  17:'SDG 17 - Partnership for the goals',}
35
 
36
  @st.cache(allow_output_mutation=True)
37
- def load_sdgClassifier(configFile = None, classifier_name = None):
38
  """
39
  loads the document classifier using haystack, where the name/path of model
40
  in HF-hub as string is used to fetch the model object.Either configfile or
@@ -52,11 +52,11 @@ def load_sdgClassifier(configFile = None, classifier_name = None):
52
  Return: document classifier model
53
  """
54
  if not classifier_name:
55
- if not configFile:
56
  logging.warning("Pass either model name or config file")
57
  return
58
  else:
59
- config = getconfig(configFile)
60
  classifier_name = config.get('sdg','MODEL')
61
 
62
  logging.info("Loading classifier")
@@ -68,8 +68,8 @@ def load_sdgClassifier(configFile = None, classifier_name = None):
68
 
69
 
70
  @st.cache(allow_output_mutation=True)
71
- def sdg_classification(haystackdoc:List[Document],
72
- threshold:float, classifiermodel= None)->Tuple[DataFrame,Series]:
73
  """
74
  Text-Classification on the list of texts provided. Classifier provides the
75
  most appropriate label for each text. these labels are in terms of if text
@@ -93,14 +93,14 @@ def sdg_classification(haystackdoc:List[Document],
93
 
94
  """
95
  logging.info("Working on SDG Classification")
96
- if not classifiermodel:
97
  if check_streamlit:
98
- classifiermodel = st.session_state['sdg_classifier']
99
  else:
100
  logging.warning("No streamlit envinornment found, Pass the classifier")
101
  return
102
 
103
- results = classifiermodel.predict(haystackdoc)
104
 
105
 
106
  labels_= [(l.meta['classification']['label'],
@@ -130,7 +130,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
130
  split_by: Literal["sentence", "word"] = 'sentence',
131
  split_respect_sentence_boundary = False,
132
  split_length:int = 2, split_overlap = 0,
133
- removePunc = False)->List[Document]:
134
  """
135
  creates the pipeline and runs the preprocessing pipeline,
136
  the params for pipeline are fetched from paramconfig
@@ -163,7 +163,7 @@ def runSDGPreprocessingPipeline(filePath, fileName,
163
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
164
  params= {"FileConverter": {"file_path": filePath, \
165
  "file_name": fileName},
166
- "UdfPreProcessor": {"removePunc": removePunc, \
167
  "split_by": split_by, \
168
  "split_length":split_length,\
169
  "split_overlap": split_overlap, \
 
34
  17:'SDG 17 - Partnership for the goals',}
35
 
36
  @st.cache(allow_output_mutation=True)
37
+ def load_sdgClassifier(config_file = None, classifier_name = None):
38
  """
39
  loads the document classifier using haystack, where the name/path of model
40
  in HF-hub as string is used to fetch the model object.Either configfile or
 
52
  Return: document classifier model
53
  """
54
  if not classifier_name:
55
+ if not config_file:
56
  logging.warning("Pass either model name or config file")
57
  return
58
  else:
59
+ config = getconfig(config_file)
60
  classifier_name = config.get('sdg','MODEL')
61
 
62
  logging.info("Loading classifier")
 
68
 
69
 
70
  @st.cache(allow_output_mutation=True)
71
+ def sdg_classification(haystack_doc:List[Document],
72
+ threshold:float, classifier_model= None)->Tuple[DataFrame,Series]:
73
  """
74
  Text-Classification on the list of texts provided. Classifier provides the
75
  most appropriate label for each text. these labels are in terms of if text
 
93
 
94
  """
95
  logging.info("Working on SDG Classification")
96
+ if not classifier_model:
97
  if check_streamlit:
98
+ classifier_model = st.session_state['sdg_classifier']
99
  else:
100
  logging.warning("No streamlit envinornment found, Pass the classifier")
101
  return
102
 
103
+ results = classifier_model.predict(haystack_doc)
104
 
105
 
106
  labels_= [(l.meta['classification']['label'],
 
130
  split_by: Literal["sentence", "word"] = 'sentence',
131
  split_respect_sentence_boundary = False,
132
  split_length:int = 2, split_overlap = 0,
133
+ remove_punc = False)->List[Document]:
134
  """
135
  creates the pipeline and runs the preprocessing pipeline,
136
  the params for pipeline are fetched from paramconfig
 
163
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
164
  params= {"FileConverter": {"file_path": filePath, \
165
  "file_name": fileName},
166
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
167
  "split_by": split_by, \
168
  "split_length":split_length,\
169
  "split_overlap": split_overlap, \
utils/semantic_search.py CHANGED
@@ -37,8 +37,8 @@ class QueryCheck(BaseComponent):
37
  Uses Query Classifier from Haystack, process the query based on query type.
38
  Ability to determine the statements is not so good, therefore the chances
39
  statement also get modified. Ex: "List water related issues" will be
40
- identified by the model as keywords, and therefore it be processed as "find
41
- all issues related to 'list all water related issues'". This is one shortcoming
42
  but is igonred for now, as semantic search will not get affected a lot, by this.
43
 
44
  1. https://docs.haystack.deepset.ai/docs/query_classifier
@@ -61,7 +61,7 @@ class QueryCheck(BaseComponent):
61
  output = {"query":query,
62
  "query_type": 'question/statement'}
63
  else:
64
- output = {"query": "find all issues related to {}".format(query),
65
  "query_type": 'statements/keyword'}
66
  logging.info(output)
67
  return output, "output_1"
@@ -74,7 +74,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
74
  split_by: Literal["sentence", "word"] = 'sentence',
75
  split_respect_sentence_boundary = False,
76
  split_length:int = 2, split_overlap = 0,
77
- removePunc = False)->List[Document]:
78
  """
79
  creates the pipeline and runs the preprocessing pipeline.
80
 
@@ -106,7 +106,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
106
  output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
107
  params= {"FileConverter": {"file_path": file_path, \
108
  "file_name": file_name},
109
- "UdfPreProcessor": {"removePunc": removePunc, \
110
  "split_by": split_by, \
111
  "split_length":split_length,\
112
  "split_overlap": split_overlap,
@@ -118,7 +118,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
118
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
119
  def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
120
  embedding_layer:int = None, retriever_top_k:int = 10,
121
- document_store:InMemoryDocumentStore = None):
122
  """
123
  Returns the Retriever model based on params provided.
124
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
@@ -133,6 +133,8 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
133
  embedding_model_format: check the github link of Haystack provided in documentation
134
  embedding_layer: check the github link of Haystack provided in documentation
135
  retriever_top_k: Number of Top results to be returned by retriever
 
 
136
  document_store: InMemoryDocumentStore, write haystack Document list to DocumentStore
137
  and pass the same to function call. Can be done using createDocumentStore from utils.
138
 
@@ -149,14 +151,15 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
149
  embedding_model=embedding_model,top_k = retriever_top_k,
150
  document_store = document_store,
151
  emb_extraction_layer=embedding_layer, scale_score =True,
152
- model_format=embedding_model_format, use_gpu = True)
 
153
  if check_streamlit:
154
  st.session_state['retriever'] = retriever
155
  return retriever
156
 
157
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
158
  def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
159
- embedding_dim:int = 768):
160
  """
161
  Creates the InMemory Document Store from haystack list of Documents.
162
  It is mandatory component for Retriever to work in Haystack frame work.
@@ -185,15 +188,20 @@ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product'
185
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
186
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
187
  useQueryCheck = True, embedding_model_format:Text = None,
 
188
  embedding_layer:int = None, retriever_top_k:int = 10,
189
- reader_model:str = None, reader_top_k:int = 10,
190
- embedding_dim:int = 768):
191
  """
192
  creates the semantic search pipeline and document Store object from the
193
  list of haystack documents. The top_k for the Reader and Retirever are kept
194
  same, so that all the results returned by Retriever are used, however the
195
  context is extracted by Reader for each retrieved result. The querycheck is
196
- added as node to process the query.
 
 
 
 
197
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
198
  2. https://www.sbert.net/examples/applications/semantic-search/README.html
199
  3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
@@ -218,6 +226,8 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
218
  embedding_dim: Document store has default value of embedding size = 768, and
219
  update_embeddings method of Docstore cannot infer the embedding size of
220
  retiever automaticallu, therefore set this value as per the model card.
 
 
221
 
222
 
223
  Return
@@ -237,27 +247,28 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
237
  embedding_model_format=embedding_model_format,
238
  embedding_layer=embedding_layer,
239
  retriever_top_k= retriever_top_k,
240
- document_store = document_store)
 
241
 
242
  document_store.update_embeddings(retriever)
243
  reader = FARMReader(model_name_or_path=reader_model,
244
  top_k = reader_top_k, use_gpu=True)
245
- semanticsearch_pipeline = Pipeline()
246
  if useQueryCheck:
247
  querycheck = QueryCheck()
248
- semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
249
  inputs = ["Query"])
250
- semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
251
  inputs = ["QueryCheck.output_1"])
252
- semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
253
  inputs= ["EmbeddingRetriever"])
254
  else:
255
- semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
256
  inputs = ["Query"])
257
- semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
258
  inputs= ["EmbeddingRetriever"])
259
 
260
- return semanticsearch_pipeline, document_store
261
 
262
 
263
  def semanticsearchAnnotator(matches: List[List[int]], document):
@@ -296,11 +307,12 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
296
  print(annotated_text)
297
 
298
 
299
- def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
300
  embedding_model_format:Text,
301
  embedding_layer:int, reader_model:str,
302
  retriever_top_k:int = 10, reader_top_k:int = 10,
303
- return_results:bool = False, embedding_dim:int = 768):
 
304
  """
305
  Performs the Semantic search on the List of haystack documents which is
306
  returned by preprocessing Pipeline.
@@ -316,7 +328,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
316
  embedding_layer= embedding_layer,
317
  embedding_model_format= embedding_model_format,
318
  reader_model= reader_model, retriever_top_k= retriever_top_k,
319
- reader_top_k= reader_top_k, embedding_dim=embedding_dim)
 
320
 
321
  results = semanticsearch_pipeline.run(query = query)
322
  if return_results:
@@ -328,10 +341,10 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
328
  print("Top few semantic search results")
329
  for i,answer in enumerate(results['answers']):
330
  temp = answer.to_dict()
331
- start_idx = temp['offsets_in_document'][0]['start']
332
- end_idx = temp['offsets_in_document'][0]['end']
333
- match = [[start_idx,end_idx]]
334
  doc = doc_store.get_document_by_id(temp['document_id']).content
 
 
 
335
  if check_streamlit:
336
  st.write("Result {}".format(i+1))
337
  else:
 
37
  Uses Query Classifier from Haystack, process the query based on query type.
38
  Ability to determine the statements is not so good, therefore the chances
39
  statement also get modified. Ex: "List water related issues" will be
40
+ identified by the model as keywords, and therefore it will be processed as "what are
41
+ the 'list all water related issues' related issues and discussions?". This is one shortcoming
42
  but is ignored for now, as semantic search will not be affected much by this.
43
 
44
  1. https://docs.haystack.deepset.ai/docs/query_classifier
 
61
  output = {"query":query,
62
  "query_type": 'question/statement'}
63
  else:
64
+ output = {"query": "what are the {} related issues and discussions?".format(query),
65
  "query_type": 'statements/keyword'}
66
  logging.info(output)
67
  return output, "output_1"
 
74
  split_by: Literal["sentence", "word"] = 'sentence',
75
  split_respect_sentence_boundary = False,
76
  split_length:int = 2, split_overlap = 0,
77
+ remove_punc = False)->List[Document]:
78
  """
79
  creates the pipeline and runs the preprocessing pipeline.
80
 
 
106
  output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
107
  params= {"FileConverter": {"file_path": file_path, \
108
  "file_name": file_name},
109
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
110
  "split_by": split_by, \
111
  "split_length":split_length,\
112
  "split_overlap": split_overlap,
 
118
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
119
  def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
120
  embedding_layer:int = None, retriever_top_k:int = 10,
121
+ max_seq_len:int = 512, document_store:InMemoryDocumentStore = None):
122
  """
123
  Returns the Retriever model based on params provided.
124
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
 
133
  embedding_model_format: check the github link of Haystack provided in documentation
134
  embedding_layer: check the github link of Haystack provided in documentation
135
  retriever_top_k: Number of Top results to be returned by retriever
136
+ max_seq_len: every model has a maximum sequence length it can handle; check the model card.
137
+ Needed to handle the edge cases.
138
  document_store: InMemoryDocumentStore, write haystack Document list to DocumentStore
139
  and pass the same to function call. Can be done using createDocumentStore from utils.
140
 
 
151
  embedding_model=embedding_model,top_k = retriever_top_k,
152
  document_store = document_store,
153
  emb_extraction_layer=embedding_layer, scale_score =True,
154
+ model_format=embedding_model_format, use_gpu = True,
155
+ max_seq_len = max_seq_len )
156
  if check_streamlit:
157
  st.session_state['retriever'] = retriever
158
  return retriever
159
 
160
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
161
  def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
162
+ embedding_dim:int = 768):
163
  """
164
  Creates the InMemory Document Store from haystack list of Documents.
165
  It is mandatory component for Retriever to work in Haystack frame work.
 
188
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
189
  def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
190
  useQueryCheck = True, embedding_model_format:Text = None,
191
+ max_seq_len:int =512,embedding_dim:int = 768,
192
  embedding_layer:int = None, retriever_top_k:int = 10,
193
+ reader_model:str = None, reader_top_k:int = 10
194
+ ):
195
  """
196
  creates the semantic search pipeline and document Store object from the
197
  list of haystack documents. The top_k for the Reader and Retriever are kept
198
  same, so that all the results returned by Retriever are used, however the
199
  context is extracted by Reader for each retrieved result. The querycheck is
200
+ added as node to process the query. This pipeline is suited for keyword search,
201
+ and to some extent extractive QA purpose. The purpose of Reader is strictly to
202
+ highlight the context for retrieved result and not for QA, however as stated
203
+ it can work for QA too in limited sense.
204
+
205
  1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
206
  2. https://www.sbert.net/examples/applications/semantic-search/README.html
207
  3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
 
226
  embedding_dim: Document store has default value of embedding size = 768, and
227
  update_embeddings method of Docstore cannot infer the embedding size of
228
  retriever automatically, therefore set this value as per the model card.
229
+ max_seq_len: every model has a maximum sequence length it can handle; check the model card.
230
+ Needed to handle the edge cases.
231
 
232
 
233
  Return
 
247
  embedding_model_format=embedding_model_format,
248
  embedding_layer=embedding_layer,
249
  retriever_top_k= retriever_top_k,
250
+ document_store = document_store,
251
+ max_seq_len=max_seq_len)
252
 
253
  document_store.update_embeddings(retriever)
254
  reader = FARMReader(model_name_or_path=reader_model,
255
  top_k = reader_top_k, use_gpu=True)
256
+ semantic_search_pipeline = Pipeline()
257
  if useQueryCheck:
258
  querycheck = QueryCheck()
259
+ semantic_search_pipeline.add_node(component = querycheck, name = "QueryCheck",
260
  inputs = ["Query"])
261
+ semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
262
  inputs = ["QueryCheck.output_1"])
263
+ semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
264
  inputs= ["EmbeddingRetriever"])
265
  else:
266
+ semantic_search_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
267
  inputs = ["Query"])
268
+ semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
269
  inputs= ["EmbeddingRetriever"])
270
 
271
+ return semantic_search_pipeline, document_store
272
 
273
 
274
  def semanticsearchAnnotator(matches: List[List[int]], document):
 
307
  print(annotated_text)
308
 
309
 
310
+ def semantic_keywordsearch(query:Text,documents:List[Document],embedding_model:Text,
311
  embedding_model_format:Text,
312
  embedding_layer:int, reader_model:str,
313
  retriever_top_k:int = 10, reader_top_k:int = 10,
314
+ return_results:bool = False, embedding_dim:int = 768,
315
+ max_seq_len:int = 512):
316
  """
317
  Performs the Semantic search on the List of haystack documents which is
318
  returned by preprocessing Pipeline.
 
328
  embedding_layer= embedding_layer,
329
  embedding_model_format= embedding_model_format,
330
  reader_model= reader_model, retriever_top_k= retriever_top_k,
331
+ reader_top_k= reader_top_k, embedding_dim=embedding_dim,
332
+ max_seq_len=max_seq_len)
333
 
334
  results = semanticsearch_pipeline.run(query = query)
335
  if return_results:
 
341
  print("Top few semantic search results")
342
  for i,answer in enumerate(results['answers']):
343
  temp = answer.to_dict()
 
 
 
344
  doc = doc_store.get_document_by_id(temp['document_id']).content
345
+ start_idx = doc.find(temp['context'])
346
+ end_idx = start_idx + len(temp['context'])
347
+ match = [[start_idx,end_idx]]
348
  if check_streamlit:
349
  st.write("Result {}".format(i+1))
350
  else: