prashant committed on
Commit
99ae6d0
·
1 Parent(s): ac18b03

semantic update

Browse files
appStore/keyword_search.py CHANGED
@@ -85,11 +85,13 @@ def app():
85
  st.markdown("##### Top few lexical search (TFIDF) hits #####")
86
  lexical_search(queryList,allDocuments['documents'])
87
  else:
88
- pass
89
- # paraList = runSemanticPreprocessingPipeline()
90
- # logging.info("starting semantic search")
91
- # with st.spinner("Performing Similar/Contextual search"):
92
- # semantic_search(queryList,paraList)
 
 
93
 
94
  else:
95
  st.info("🤔 No document found, please try to upload it at the sidebar!")
 
85
  st.markdown("##### Top few lexical search (TFIDF) hits #####")
86
  lexical_search(queryList,allDocuments['documents'])
87
  else:
88
+ allDocuments = runSemanticPreprocessingPipeline(
89
+ st.session_state['filepath'],
90
+ st.session_state['filename'])
91
+
92
+ logging.info("starting semantic search")
93
+ with st.spinner("Performing Similar/Contextual search"):
94
+ semantic_search(queryList,allDocuments['documents'])
95
 
96
  else:
97
  st.info("🤔 No document found, please try to upload it at the sidebar!")
paramconfig.cfg CHANGED
@@ -1,6 +1,5 @@
1
  [lexical_search]
2
  TOP_K = 20
3
- THRESHOLD = 0.1
4
  SPLIT_BY = sentence
5
  SPLIT_LENGTH = 3
6
  SPLIT_OVERLAP = 0
 
1
  [lexical_search]
2
  TOP_K = 20
 
3
  SPLIT_BY = sentence
4
  SPLIT_LENGTH = 3
5
  SPLIT_OVERLAP = 0
utils/lexical_search.py CHANGED
@@ -18,15 +18,14 @@ except:
18
  pass
19
 
20
  try:
21
- import streamlit as st
22
-
23
  except ImportError:
24
  logging.info("Streamlit not installed")
25
  config = configparser.ConfigParser()
26
  try:
27
  config.read_file(open('paramconfig.cfg'))
28
  except Exception:
29
- logging.info("paramconfig file not found")
30
  st.info("Please place the paramconfig file in the same directory as app.py")
31
 
32
 
 
18
  pass
19
 
20
  try:
21
+ import streamlit as st
 
22
  except ImportError:
23
  logging.info("Streamlit not installed")
24
  config = configparser.ConfigParser()
25
  try:
26
  config.read_file(open('paramconfig.cfg'))
27
  except Exception:
28
+ logging.warning("paramconfig file not found")
29
  st.info("Please place the paramconfig file in the same directory as app.py")
30
 
31
 
utils/semantic_search.py CHANGED
@@ -3,20 +3,41 @@ from haystack.nodes import EmbeddingRetriever, FARMReader
3
  from haystack.nodes.base import BaseComponent
4
  from haystack.document_stores import InMemoryDocumentStore
5
  import configparser
6
- import streamlit as st
7
  from markdown import markdown
8
  from annotated_text import annotation
9
  from haystack.schema import Document
10
  from typing import List, Text
11
  from utils.preprocessing import processingpipeline
 
12
  from haystack.pipelines import Pipeline
13
-
 
 
 
 
 
 
 
 
14
  config = configparser.ConfigParser()
15
- config.read_file(open('paramconfig.cfg'))
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  class QueryCheck(BaseComponent):
18
  """
19
  Uses Query Classifier from Haystack, process the query based on query type
 
 
20
  """
21
 
22
  outgoing_edges = 1
@@ -28,11 +49,7 @@ class QueryCheck(BaseComponent):
28
  useful for sentence transoformers.
29
 
30
  """
31
-
32
- query_classifier = TransformersQueryClassifier(model_name_or_path=
33
- "shahrukhx01/bert-mini-finetune-question-detection")
34
-
35
-
36
  result = query_classifier.run(query=query)
37
 
38
  if result[1] == "output_1":
@@ -46,11 +63,20 @@ class QueryCheck(BaseComponent):
46
  def run_batch(self, query):
47
  pass
48
 
49
- def runSemanticPreprocessingPipeline()->List[Document]:
 
50
  """
51
  creates the pipeline and runs the preprocessing pipeline,
52
  the params for pipeline are fetched from paramconfig
53
 
 
 
 
 
 
 
 
 
54
  Return
55
  --------------
56
  List[Document]: When preprocessing pipeline is run, the output dictionary
@@ -59,8 +85,7 @@ def runSemanticPreprocessingPipeline()->List[Document]:
59
  key = 'documents' on output.
60
 
61
  """
62
- file_path = st.session_state['filepath']
63
- file_name = st.session_state['filename']
64
  semantic_processing_pipeline = processingpipeline()
65
  split_by = config.get('semantic_search','SPLIT_BY')
66
  split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
@@ -74,9 +99,48 @@ def runSemanticPreprocessingPipeline()->List[Document]:
74
  "split_length":split_length,\
75
  "split_overlap": split_overlap}})
76
 
77
- return output_semantic_pre['documents']
 
 
 
 
 
 
 
 
 
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
 
 
80
  def semanticSearchPipeline(documents:List[Document]):
81
  """
82
  creates the semantic search pipeline and document Store object from the
@@ -100,73 +164,19 @@ def semanticSearchPipeline(documents:List[Document]):
100
  list of document returned by preprocessing pipeline.
101
 
102
  """
103
- if 'document_store' in st.session_state:
104
- document_store = st.session_state['document_store']
105
- temp = document_store.get_all_documents()
106
- if st.session_state['filename'] != temp[0].meta['name']:
107
-
108
- document_store = InMemoryDocumentStore()
109
- document_store.write_documents(documents)
110
- if 'retriever' in st.session_state:
111
- retriever = st.session_state['retriever']
112
- document_store.update_embeddings(retriever)
113
- # querycheck =
114
-
115
-
116
- # embedding_model = config.get('semantic_search','RETRIEVER')
117
- # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
118
- # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
119
- # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
120
- # retriever = EmbeddingRetriever(
121
- # document_store=document_store,
122
- # embedding_model=embedding_model,top_k = retriever_top_k,
123
- # emb_extraction_layer=embedding_layer, scale_score =True,
124
- # model_format=embedding_model_format, use_gpu = True)
125
- # document_store.update_embeddings(retriever)
126
- else:
127
- embedding_model = config.get('semantic_search','RETRIEVER')
128
- embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
129
- retriever = EmbeddingRetriever(
130
- document_store=document_store,
131
- embedding_model=embedding_model,top_k = retriever_top_k,
132
- emb_extraction_layer=embedding_layer, scale_score =True,
133
- model_format=embedding_model_format, use_gpu = True)
134
-
135
  else:
136
- document_store = InMemoryDocumentStore()
137
- document_store.write_documents(documents)
138
-
139
- embedding_model = config.get('semantic_search','RETRIEVER')
140
- embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
141
- embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
142
- retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
143
-
144
-
145
- retriever = EmbeddingRetriever(
146
- document_store=document_store,
147
- embedding_model=embedding_model,top_k = retriever_top_k,
148
- emb_extraction_layer=embedding_layer, scale_score =True,
149
- model_format=embedding_model_format, use_gpu = True)
150
- st.session_state['retriever'] = retriever
151
- document_store.update_embeddings(retriever)
152
- st.session_state['document_store'] = document_store
153
- querycheck = QueryCheck()
154
- st.session_state['querycheck'] = querycheck
155
  reader_model = config.get('semantic_search','READER')
156
- reader_top_k = retriever_top_k
157
  reader = FARMReader(model_name_or_path=reader_model,
158
  top_k = reader_top_k, use_gpu=True)
159
-
160
  st.session_state['reader'] = reader
161
 
162
- querycheck = QueryCheck()
163
-
164
- reader_model = config.get('semantic_search','READER')
165
- reader_top_k = retriever_top_k
166
- reader = FARMReader(model_name_or_path=reader_model,
167
- top_k = reader_top_k, use_gpu=True)
168
-
169
-
170
  semanticsearch_pipeline = Pipeline()
171
  semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
172
  inputs = ["Query"])
@@ -174,9 +184,88 @@ def semanticSearchPipeline(documents:List[Document]):
174
  inputs = ["QueryCheck.output_1"])
175
  semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
176
  inputs= ["EmbeddingRetriever"])
177
-
178
  return semanticsearch_pipeline, document_store
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def semanticsearchAnnotator(matches: List[List[int]], document):
181
  """
182
  Annotates the text in the document defined by list of [start index, end index]
@@ -191,18 +280,27 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
191
  for match in matches:
192
  start_idx = match[0]
193
  end_idx = match[1]
194
- annotated_text = (annotated_text + document[start:start_idx]
195
- + str(annotation(body=document[start_idx:end_idx],
196
- label="CONTEXT", background="#964448", color='#ffffff')))
 
 
 
 
 
197
  start = end_idx
198
 
199
  annotated_text = annotated_text + document[end_idx:]
200
-
201
- st.write(
202
- markdown(annotated_text),
203
- unsafe_allow_html=True,
204
- )
205
 
 
 
 
 
 
 
 
 
 
206
 
207
  def semantic_search(query:Text,documents:List[Document]):
208
  """
 
3
  from haystack.nodes.base import BaseComponent
4
  from haystack.document_stores import InMemoryDocumentStore
5
  import configparser
 
6
  from markdown import markdown
7
  from annotated_text import annotation
8
  from haystack.schema import Document
9
  from typing import List, Text
10
  from utils.preprocessing import processingpipeline
11
+ from utils.streamlitcheck import check_streamlit
12
  from haystack.pipelines import Pipeline
13
+ import logging
14
+ try:
15
+ from termcolor import colored
16
+ except:
17
+ pass
18
+ try:
19
+ import streamlit as st
20
+ except ImportError:
21
+ logging.info("Streamlit not installed")
22
  config = configparser.ConfigParser()
23
+ try:
24
+ config.read_file(open('paramconfig.cfg'))
25
+ except Exception:
26
+ logging.info("paramconfig file not found")
27
+ st.info("Please place the paramconfig file in the same directory as app.py")
28
+
29
+
30
+ @st.cache(allow_output_mutation=True)
31
+ def loadQueryClassifier():
32
+ query_classifier = TransformersQueryClassifier(model_name_or_path=
33
+ "shahrukhx01/bert-mini-finetune-question-detection")
34
+ return query_classifier
35
 
36
  class QueryCheck(BaseComponent):
37
  """
38
  Uses Query Classifier from Haystack, process the query based on query type
39
+ 1. https://docs.haystack.deepset.ai/docs/query_classifier
40
+
41
  """
42
 
43
  outgoing_edges = 1
 
49
  useful for sentence transoformers.
50
 
51
  """
52
+ query_classifier = loadQueryClassifier()
 
 
 
 
53
  result = query_classifier.run(query=query)
54
 
55
  if result[1] == "output_1":
 
63
  def run_batch(self, query):
64
  pass
65
 
66
+
67
+ def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
68
  """
69
  creates the pipeline and runs the preprocessing pipeline,
70
  the params for pipeline are fetched from paramconfig
71
 
72
+ Params
73
+ ------------
74
+
75
+ file_name: filename, in case of streamlit application use
76
+ st.session_state['filename']
77
+ file_path: filepath, in case of streamlit application use
78
+ st.session_state['filepath']
79
+
80
  Return
81
  --------------
82
  List[Document]: When preprocessing pipeline is run, the output dictionary
 
85
  key = 'documents' on output.
86
 
87
  """
88
+
 
89
  semantic_processing_pipeline = processingpipeline()
90
  split_by = config.get('semantic_search','SPLIT_BY')
91
  split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
 
99
  "split_length":split_length,\
100
  "split_overlap": split_overlap}})
101
 
102
+ return output_semantic_pre
103
+
104
+
105
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
106
+ def loadRetriever(embedding_model = None, embedding_model_format = None,
107
+ embedding_layer = None, retriever_top_k = 10, document_store = None):
108
+ logging.info("loading retriever")
109
+ if document_store is None:
110
+ logging.warning("Retriever initialization requires the DocumentStore")
111
+ return
112
+
113
 
114
+ if embedding_model is None:
115
+ try:
116
+ embedding_model = config.get('semantic_search','RETRIEVER')
117
+ embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
118
+ embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
119
+ retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
120
+ except Exception as e:
121
+ logging.info(e)
122
+ st.info(e)
123
+
124
+ retriever = EmbeddingRetriever(
125
+ embedding_model=embedding_model,top_k = retriever_top_k,
126
+ document_store = document_store,
127
+ emb_extraction_layer=embedding_layer, scale_score =True,
128
+ model_format=embedding_model_format, use_gpu = True)
129
+ st.session_state['retriever'] = retriever
130
+ return retriever
131
+
132
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
133
+ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
134
+ document_store = InMemoryDocumentStore(similarity = similarity)
135
+ document_store.write_documents(documents)
136
+ if 'retriever' in st.session_state:
137
+ retriever = st.session_state['retriever']
138
+ document_store.update_embeddings(retriever)
139
+
140
+ return document_store
141
 
142
+
143
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
144
  def semanticSearchPipeline(documents:List[Document]):
145
  """
146
  creates the semantic search pipeline and document Store object from the
 
164
  list of document returned by preprocessing pipeline.
165
 
166
  """
167
+ document_store = createDocumentStore(documents)
168
+ retriever = loadRetriever(document_store=document_store)
169
+ document_store.update_embeddings(retriever)
170
+ querycheck = QueryCheck()
171
+ if 'reader' in st.session_state:
172
+ reader = st.session_state['reader']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  reader_model = config.get('semantic_search','READER')
175
+ reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
176
  reader = FARMReader(model_name_or_path=reader_model,
177
  top_k = reader_top_k, use_gpu=True)
 
178
  st.session_state['reader'] = reader
179
 
 
 
 
 
 
 
 
 
180
  semanticsearch_pipeline = Pipeline()
181
  semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
182
  inputs = ["Query"])
 
184
  inputs = ["QueryCheck.output_1"])
185
  semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
186
  inputs= ["EmbeddingRetriever"])
187
+
188
  return semanticsearch_pipeline, document_store
189
 
190
+
191
+
192
+ # if 'document_store' in st.session_state:
193
+ # document_store = st.session_state['document_store']
194
+ # temp = document_store.get_all_documents()
195
+ # if st.session_state['filename'] != temp[0].meta['name']:
196
+
197
+ # document_store = InMemoryDocumentStore()
198
+ # document_store.write_documents(documents)
199
+ # if 'retriever' in st.session_state:
200
+ # retriever = st.session_state['retriever']
201
+ # document_store.update_embeddings(retriever)
202
+ # # querycheck =
203
+
204
+
205
+ # # embedding_model = config.get('semantic_search','RETRIEVER')
206
+ # # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
207
+ # # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
208
+ # # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
209
+ # # retriever = EmbeddingRetriever(
210
+ # # document_store=document_store,
211
+ # # embedding_model=embedding_model,top_k = retriever_top_k,
212
+ # # emb_extraction_layer=embedding_layer, scale_score =True,
213
+ # # model_format=embedding_model_format, use_gpu = True)
214
+ # # document_store.update_embeddings(retriever)
215
+ # else:
216
+ # embedding_model = config.get('semantic_search','RETRIEVER')
217
+ # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
218
+ # retriever = EmbeddingRetriever(
219
+ # document_store=document_store,
220
+ # embedding_model=embedding_model,top_k = retriever_top_k,
221
+ # emb_extraction_layer=embedding_layer, scale_score =True,
222
+ # model_format=embedding_model_format, use_gpu = True)
223
+
224
+ # else:
225
+ # document_store = InMemoryDocumentStore()
226
+ # document_store.write_documents(documents)
227
+
228
+ # embedding_model = config.get('semantic_search','RETRIEVER')
229
+ # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
230
+ # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
231
+ # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
232
+
233
+
234
+ # retriever = EmbeddingRetriever(
235
+ # document_store=document_store,
236
+ # embedding_model=embedding_model,top_k = retriever_top_k,
237
+ # emb_extraction_layer=embedding_layer, scale_score =True,
238
+ # model_format=embedding_model_format, use_gpu = True)
239
+ # st.session_state['retriever'] = retriever
240
+ # document_store.update_embeddings(retriever)
241
+ # st.session_state['document_store'] = document_store
242
+ # querycheck = QueryCheck()
243
+ # st.session_state['querycheck'] = querycheck
244
+ # reader_model = config.get('semantic_search','READER')
245
+ # reader_top_k = retriever_top_k
246
+ # reader = FARMReader(model_name_or_path=reader_model,
247
+ # top_k = reader_top_k, use_gpu=True)
248
+
249
+ # st.session_state['reader'] = reader
250
+
251
+ # querycheck = QueryCheck()
252
+
253
+ # reader_model = config.get('semantic_search','READER')
254
+ # reader_top_k = retriever_top_k
255
+ # reader = FARMReader(model_name_or_path=reader_model,
256
+ # top_k = reader_top_k, use_gpu=True)
257
+
258
+
259
+ # semanticsearch_pipeline = Pipeline()
260
+ # semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
261
+ # inputs = ["Query"])
262
+ # semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
263
+ # inputs = ["QueryCheck.output_1"])
264
+ # semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
265
+ # inputs= ["EmbeddingRetriever"])
266
+
267
+ # return semanticsearch_pipeline, document_store
268
+
269
  def semanticsearchAnnotator(matches: List[List[int]], document):
270
  """
271
  Annotates the text in the document defined by list of [start index, end index]
 
280
  for match in matches:
281
  start_idx = match[0]
282
  end_idx = match[1]
283
+ if check_streamlit():
284
+ annotated_text = (annotated_text + document[start:start_idx]
285
+ + str(annotation(body=document[start_idx:end_idx],
286
+ label="ANSWER", background="#964448", color='#ffffff')))
287
+ else:
288
+ annotated_text = (annotated_text + document[start:start_idx]
289
+ + colored(document[start_idx:end_idx],
290
+ "green", attrs = ['bold']))
291
  start = end_idx
292
 
293
  annotated_text = annotated_text + document[end_idx:]
 
 
 
 
 
294
 
295
+ if check_streamlit():
296
+
297
+ st.write(
298
+ markdown(annotated_text),
299
+ unsafe_allow_html=True,
300
+ )
301
+ else:
302
+ print(annotated_text)
303
+
304
 
305
  def semantic_search(query:Text,documents:List[Document]):
306
  """