prashant committed on
Commit
ed0fd13
·
1 Parent(s): 8eb1cf0

search update

Browse files
appStore/keyword_search.py CHANGED
@@ -49,8 +49,8 @@ def app():
49
 
50
  searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
51
  ['Exact Matches', 'Similar context/meaning'])
52
- if searchtype == 'Similar context/meaning':
53
- show_answers = st.sidebar.checkbox("Show context")
54
 
55
 
56
 
@@ -87,10 +87,6 @@ def app():
87
  paraList = runSemanticPreprocessingPipeline()
88
  logging.info("starting semantic search")
89
  with st.spinner("Performing Similar/Contextual search"):
90
- st.markdown("##### Top few semantic search results #####")
91
- if show_answers:
92
- semantic_search(queryList,paraList,show_answers=True)
93
- else:
94
- semantic_search(queryList,paraList,show_answers=False)
95
 
96
 
 
49
 
50
  searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
51
  ['Exact Matches', 'Similar context/meaning'])
52
+ # if searchtype == 'Similar context/meaning':
53
+ # show_answers = st.sidebar.checkbox("Show context")
54
 
55
 
56
 
 
87
  paraList = runSemanticPreprocessingPipeline()
88
  logging.info("starting semantic search")
89
  with st.spinner("Performing Similar/Contextual search"):
90
+ semantic_search(queryList,paraList,show_answers=True)
 
 
 
 
91
 
92
 
utils/lexical_search.py CHANGED
@@ -160,6 +160,12 @@ def lexical_search(query:Text,documents:List[Document]):
160
  """
161
  Performs the Lexical search on the List of haystack documents which is
162
  returned by preprocessing Pipeline.
 
 
 
 
 
 
163
  """
164
 
165
  document_store = InMemoryDocumentStore()
 
160
  """
161
  Performs the Lexical search on the List of haystack documents which is
162
  returned by preprocessing Pipeline.
163
+
164
+ Params
165
+ -------
166
+ query: Keywords that need to be searched in documents.
167
+ documents: List of Haystack documents returned by preprocessing pipeline.
168
+
169
  """
170
 
171
  document_store = InMemoryDocumentStore()
utils/semantic_search.py CHANGED
@@ -15,10 +15,19 @@ config = configparser.ConfigParser()
15
  config.read_file(open('paramconfig.cfg'))
16
 
17
  class QueryCheck(BaseComponent):
 
 
 
18
 
19
  outgoing_edges = 1
20
 
21
  def run(self, query):
 
 
 
 
 
 
22
 
23
  query_classifier = TransformersQueryClassifier(model_name_or_path=
24
  "shahrukhx01/bert-mini-finetune-question-detection")
@@ -32,7 +41,6 @@ class QueryCheck(BaseComponent):
32
  else:
33
  output = {"query": "find all issues related to {}".format(query),
34
  "query_type": 'statements/keyword'}
35
-
36
  return output, "output_1"
37
 
38
  def run_batch(self, query):
@@ -69,7 +77,30 @@ def runSemanticPreprocessingPipeline()->List[Document]:
69
  return output_semantic_pre['documents']
70
 
71
 
72
- def semanticSearchPipeline(documents, show_answers = False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  document_store = InMemoryDocumentStore()
74
  document_store.write_documents(documents)
75
 
@@ -87,6 +118,10 @@ def semanticSearchPipeline(documents, show_answers = False):
87
  emb_extraction_layer=embedding_layer, scale_score =True,
88
  model_format=embedding_model_format, use_gpu = True)
89
  document_store.update_embeddings(retriever)
 
 
 
 
90
 
91
 
92
  semanticsearch_pipeline = Pipeline()
@@ -94,14 +129,8 @@ def semanticSearchPipeline(documents, show_answers = False):
94
  inputs = ["Query"])
95
  semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
96
  inputs = ["QueryCheck.output_1"])
97
- if show_answers == True:
98
- reader_model = config.get('semantic_search','READER')
99
- reader_top_k = retriever_top_k
100
- reader = FARMReader(model_name_or_path=reader_model,
101
- top_k = reader_top_k, use_gpu=True)
102
-
103
- semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
104
- inputs= ["EmbeddingRetriever"])
105
 
106
  return semanticsearch_pipeline, document_store
107
 
@@ -132,41 +161,25 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
132
  )
133
 
134
 
135
- def semantic_search(query:Text,documents:List[Document],show_answers = False):
136
  """
137
- Performs the Lexical search on the List of haystack documents which is
138
  returned by preprocessing Pipeline.
 
 
 
 
 
 
139
  """
140
- threshold = 0.4
141
- semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
142
- show_answers=show_answers)
143
  results = semanticsearch_pipeline.run(query = query)
144
-
145
-
146
- if show_answers == False:
147
- results = results['documents']
148
- for i,queryhit in enumerate(results):
149
-
150
- if queryhit.score > threshold:
151
- st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
152
- st.markdown("---")
153
-
154
- else:
155
-
156
- for answer in results['answers']:
157
- # st.write(answer)
158
- # matches = []
159
- # doc = []
160
- if answer.score >0.01:
161
- temp = answer.to_dict()
162
- start_idx = temp['offsets_in_document'][0]['start']
163
- end_idx = temp['offsets_in_document'][0]['end']
164
-
165
- # matches.append([start_idx,end_idx])
166
- # doc.append(doc_store.get_document_by_id(temp['document_id']).content)
167
- match = [[start_idx,end_idx]]
168
- doc = doc_store.get_document_by_id(temp['document_id']).content
169
- semanticsearchAnnotator(match,doc)
170
-
171
-
172
-
 
15
  config.read_file(open('paramconfig.cfg'))
16
 
17
  class QueryCheck(BaseComponent):
18
+ """
19
+ Uses the Query Classifier from Haystack to process the query based on query type
20
+ """
21
 
22
  outgoing_edges = 1
23
 
24
  def run(self, query):
25
+ """
26
+ Mandatory method to use the custom node. Determines the query type; if
27
+ the query is of type keyword/statement, it is modified to make it more
28
+ useful for sentence transformers.
29
+
30
+ """
31
 
32
  query_classifier = TransformersQueryClassifier(model_name_or_path=
33
  "shahrukhx01/bert-mini-finetune-question-detection")
 
41
  else:
42
  output = {"query": "find all issues related to {}".format(query),
43
  "query_type": 'statements/keyword'}
 
44
  return output, "output_1"
45
 
46
  def run_batch(self, query):
 
77
  return output_semantic_pre['documents']
78
 
79
 
80
+ def semanticSearchPipeline(documents:List[Document]):
81
+ """
82
+ creates the semantic search pipeline and document Store object from the
83
+ list of haystack documents. Retriever and Reader model are read from
84
+ paramconfig. The top_k for the Reader and Retriever are kept the same, so that
85
+ all the results returned by Retriever are used, however the context is
86
+ extracted by Reader for each retrieved result. The querycheck is added as
87
+ node to process the query.
88
+
89
+
90
+ Params
91
+ ----------
92
+ documents: list of Haystack Documents, returned by preprocessing pipeline.
93
+
94
+ Return
95
+ ---------
96
+ semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
97
+ nodes [QueryCheck, Retriever, Reader]
98
+
99
+ document_store: As retriever can work only with Haystack Document Store, the
100
+ list of document returned by preprocessing pipeline.
101
+
102
+ """
103
+
104
  document_store = InMemoryDocumentStore()
105
  document_store.write_documents(documents)
106
 
 
118
  emb_extraction_layer=embedding_layer, scale_score =True,
119
  model_format=embedding_model_format, use_gpu = True)
120
  document_store.update_embeddings(retriever)
121
+ reader_model = config.get('semantic_search','READER')
122
+ reader_top_k = retriever_top_k
123
+ reader = FARMReader(model_name_or_path=reader_model,
124
+ top_k = reader_top_k, use_gpu=True)
125
 
126
 
127
  semanticsearch_pipeline = Pipeline()
 
129
  inputs = ["Query"])
130
  semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
131
  inputs = ["QueryCheck.output_1"])
132
+ semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
133
+ inputs= ["EmbeddingRetriever"])
 
 
 
 
 
 
134
 
135
  return semanticsearch_pipeline, document_store
136
 
 
161
  )
162
 
163
 
164
+ def semantic_search(query:Text,documents:List[Document]):
165
  """
166
+ Performs the Semantic search on the List of haystack documents which is
167
  returned by preprocessing Pipeline.
168
+
169
+ Params
170
+ -------
171
+ query: Keywords that need to be searched in documents.
172
+ documents: List of Haystack documents returned by preprocessing pipeline.
173
+
174
  """
175
+ semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
 
 
176
  results = semanticsearch_pipeline.run(query = query)
177
+ st.markdown("##### Top few semantic search results #####")
178
+ for i,answer in enumerate(results['answers']):
179
+ temp = answer.to_dict()
180
+ start_idx = temp['offsets_in_document'][0]['start']
181
+ end_idx = temp['offsets_in_document'][0]['end']
182
+ match = [[start_idx,end_idx]]
183
+ doc = doc_store.get_document_by_id(temp['document_id']).content
184
+ st.write("Result {}".format(i+1))
185
+ semanticsearchAnnotator(match, doc)