prashant committed
Commit · ed0fd13
Parent(s): 8eb1cf0

search update

Files changed:
- appStore/keyword_search.py +3 -7
- utils/lexical_search.py +6 -0
- utils/semantic_search.py +57 -44
appStore/keyword_search.py
CHANGED
@@ -49,8 +49,8 @@ def app():
 
     searchtype = st.selectbox("Do you want to find exact matches or similar meaning/context",
                               ['Exact Matches', 'Similar context/meaning'])
-    if searchtype == 'Similar context/meaning':
-        show_answers = st.sidebar.checkbox("Show context")
+    # if searchtype == 'Similar context/meaning':
+    #     show_answers = st.sidebar.checkbox("Show context")
 
 
 
@@ -87,10 +87,6 @@ def app():
     paraList = runSemanticPreprocessingPipeline()
     logging.info("starting semantic search")
     with st.spinner("Performing Similar/Contextual search"):
-
-        if show_answers:
-            semantic_search(queryList,paraList,show_answers=True)
-        else:
-            semantic_search(queryList,paraList,show_answers=False)
+        semantic_search(queryList,paraList)
 
 
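Note: with the sidebar toggle commented out, the contextual branch of app() always runs the full retriever-plus-reader pipeline. A minimal sketch of the simplified flow, assuming a text_input widget supplies queryList (only the selectbox label, the preprocessing call, and semantic_search come from this commit; the rest is illustrative):

    import logging
    import streamlit as st

    from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_search

    def app():
        searchtype = st.selectbox("Do you want to find exact matches or similar meaning/context",
                                  ['Exact Matches', 'Similar context/meaning'])
        queryList = st.text_input("What do you want to search for?")  # assumed input widget
        if searchtype == 'Similar context/meaning' and queryList:
            paraList = runSemanticPreprocessingPipeline()
            logging.info("starting semantic search")
            with st.spinner("Performing Similar/Contextual search"):
                semantic_search(queryList, paraList)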
utils/lexical_search.py
CHANGED
@@ -160,6 +160,12 @@ def lexical_search(query:Text,documents:List[Document]):
     """
     Performs the lexical search on the list of Haystack documents which is
     returned by the preprocessing pipeline.
+
+    Params
+    -------
+    query: Keywords that need to be searched in documents.
+    documents: List of Haystack documents returned by the preprocessing pipeline.
+
     """
 
     document_store = InMemoryDocumentStore()
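Note: only the InMemoryDocumentStore line of the function body appears in this hunk, so the sketch below fills in a plausible keyword retriever (Haystack 1.x TfidfRetriever) purely to illustrate the documented signature; the real ranking and highlighting logic of lexical_search lies outside this diff:

    from haystack.document_stores import InMemoryDocumentStore
    from haystack.nodes import TfidfRetriever
    from haystack.schema import Document

    def lexical_search_sketch(query, documents):
        # Index the preprocessed paragraphs, as lexical_search does.
        document_store = InMemoryDocumentStore()
        document_store.write_documents(documents)
        # Illustrative keyword-based retriever; an assumption, not the
        # retriever this repo actually uses.
        retriever = TfidfRetriever(document_store=document_store)
        return retriever.retrieve(query=query, top_k=10)

    docs = [Document(content="The policy addresses water scarcity in rural districts.")]
    for result in lexical_search_sketch("water scarcity", docs):
        print(result.content)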
utils/semantic_search.py
CHANGED
@@ -15,10 +15,19 @@ config = configparser.ConfigParser()
 config.read_file(open('paramconfig.cfg'))
 
 class QueryCheck(BaseComponent):
+    """
+    Uses the Query Classifier from Haystack to process the query based on the
+    query type.
+    """
 
     outgoing_edges = 1
 
     def run(self, query):
+        """
+        Mandatory method to use the custom node. Determines the query type; if
+        the query is of type keyword/statement, it will be modified to make it
+        more useful for sentence transformers.
+
+        """
 
         query_classifier = TransformersQueryClassifier(model_name_or_path=
                            "shahrukhx01/bert-mini-finetune-question-detection")
@@ -32,7 +41,6 @@ class QueryCheck(BaseComponent):
         else:
             output = {"query": "find all issues related to {}".format(query),
                       "query_type": 'statements/keyword'}
-
         return output, "output_1"
 
     def run_batch(self, query):
@@ -69,7 +77,30 @@ def runSemanticPreprocessingPipeline()->List[Document]:
     return output_semantic_pre['documents']
 
 
-def semanticSearchPipeline(documents, show_answers = False):
+def semanticSearchPipeline(documents:List[Document]):
+    """
+    Creates the semantic search pipeline and document store object from the
+    list of Haystack documents. The Retriever and Reader models are read from
+    paramconfig. The top_k for the Reader and Retriever are kept the same, so
+    that all the results returned by the Retriever are used; however, the
+    context is extracted by the Reader for each retrieved result. QueryCheck
+    is added as a node to process the query.
+
+
+    Params
+    ----------
+    documents: list of Haystack Documents, returned by the preprocessing pipeline.
+
+    Return
+    ---------
+    semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
+        nodes [QueryCheck, Retriever, Reader]
+    document_store: as the Retriever can work only with a Haystack document
+        store, the list of documents returned by the preprocessing pipeline is indexed there.
+
+    """
+
+
     document_store = InMemoryDocumentStore()
     document_store.write_documents(documents)
 
@@ -87,6 +118,10 @@ def semanticSearchPipeline(documents, show_answers = False):
                                    emb_extraction_layer=embedding_layer, scale_score =True,
                                    model_format=embedding_model_format, use_gpu = True)
     document_store.update_embeddings(retriever)
+    reader_model = config.get('semantic_search','READER')
+    reader_top_k = retriever_top_k
+    reader = FARMReader(model_name_or_path=reader_model,
+                        top_k = reader_top_k, use_gpu=True)
 
 
     semanticsearch_pipeline = Pipeline()
@@ -94,14 +129,8 @@ def semanticSearchPipeline(documents, show_answers = False):
                                      inputs = ["Query"])
     semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
                                      inputs = ["QueryCheck.output_1"])
-
-
-    reader_top_k = retriever_top_k
-    reader = FARMReader(model_name_or_path=reader_model,
-                        top_k = reader_top_k, use_gpu=True)
-
-    semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
-                                     inputs= ["EmbeddingRetriever"])
+    semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
+                                     inputs= ["EmbeddingRetriever"])
 
     return semanticsearch_pipeline, document_store
 
@@ -132,41 +161,25 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     )
 
 
-def semantic_search(query:Text,documents:List[Document],show_answers=False):
+def semantic_search(query:Text,documents:List[Document]):
     """
-    Performs the
+    Performs the semantic search on the list of Haystack documents which is
     returned by the preprocessing pipeline.
+
+    Params
+    -------
+    query: Keywords that need to be searched in documents.
+    documents: List of Haystack documents returned by the preprocessing pipeline.
+
     """
-
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
-                                            show_answers=show_answers)
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
     results = semanticsearch_pipeline.run(query = query)
-
-
-
-
-
-
-
-
-
-
-    else:
-
-        for answer in results['answers']:
-            # st.write(answer)
-            # matches = []
-            # doc = []
-            if answer.score >0.01:
-                temp = answer.to_dict()
-                start_idx = temp['offsets_in_document'][0]['start']
-                end_idx = temp['offsets_in_document'][0]['end']
-
-                # matches.append([start_idx,end_idx])
-                # doc.append(doc_store.get_document_by_id(temp['document_id']).content)
-                match = [[start_idx,end_idx]]
-                doc = doc_store.get_document_by_id(temp['document_id']).content
-                semanticsearchAnnotator(match,doc)
-
-
-
+    st.markdown("##### Top few semantic search results #####")
+    for i,answer in enumerate(results['answers']):
+        temp = answer.to_dict()
+        start_idx = temp['offsets_in_document'][0]['start']
+        end_idx = temp['offsets_in_document'][0]['end']
+        match = [[start_idx,end_idx]]
+        doc = doc_store.get_document_by_id(temp['document_id']).content
+        st.write("Result {}".format(i+1))
+        semanticsearchAnnotator(match, doc)
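Note: the new docstrings spell out what QueryCheck does: a question passes through unchanged, while a bare keyword query is rewritten into a fuller statement before it reaches the retriever. A standalone sketch of that routing (the model and the rewrite template come from this file; treating output_1 as the question-like edge is an assumption consistent with the branching here):

    from haystack.nodes import TransformersQueryClassifier

    query_classifier = TransformersQueryClassifier(
        model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection")

    def check_query(query):
        # The classifier routes question-like queries to one edge and
        # keyword/statement queries to the other.
        _, edge = query_classifier.run(query=query)
        if edge == "output_1":  # assumed: question-like query
            return {"query": query, "query_type": "question/statement"}
        return {"query": "find all issues related to {}".format(query),
                "query_type": "statements/keyword"}

    print(check_query("What are the key adaptation policies?"))
    print(check_query("adaptation"))  # -> "find all issues related to adaptation"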
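Note: the reader checkpoint is now read from the same paramconfig.cfg that the module opens at import time. A hypothetical sketch of the relevant section; only the READER key under [semantic_search] is confirmed by this hunk, and the other keys and all values are assumed counterparts of the retriever variables used above:

    [semantic_search]
    # Only READER is confirmed by this commit; the rest is illustrative.
    READER = deepset/roberta-base-squad2
    RETRIEVER = msmarco-distilbert-base-tas-b
    RETRIEVER_TOP_K = 10
    EMBEDDING_LAYER = -1
    RETRIEVER_FORMAT = sentence_transformers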
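Note: after these hunks the reader is built unconditionally and wired in as a third node, so the pipeline shape no longer depends on show_answers. A condensed sketch of the resulting assembly (QueryCheck is the custom node defined in this file; model names and top_k stand in for the paramconfig values):

    from haystack import Pipeline
    from haystack.document_stores import InMemoryDocumentStore
    from haystack.nodes import EmbeddingRetriever, FARMReader

    from utils.semantic_search import QueryCheck  # the custom node defined above

    def build_semantic_pipeline(documents, embedding_model, reader_model, top_k=10):
        # The retriever can only search a Haystack document store, so the
        # preprocessed paragraphs are indexed first.
        document_store = InMemoryDocumentStore()
        document_store.write_documents(documents)

        retriever = EmbeddingRetriever(document_store=document_store,
                                       embedding_model=embedding_model,
                                       top_k=top_k, use_gpu=True)
        document_store.update_embeddings(retriever)

        # Reader top_k matches the retriever's, so a context span is
        # extracted for every retrieved paragraph.
        reader = FARMReader(model_name_or_path=reader_model, top_k=top_k, use_gpu=True)

        pipeline = Pipeline()
        pipeline.add_node(component=QueryCheck(), name="QueryCheck", inputs=["Query"])
        pipeline.add_node(component=retriever, name="EmbeddingRetriever",
                          inputs=["QueryCheck.output_1"])
        pipeline.add_node(component=reader, name="FARMReader",
                          inputs=["EmbeddingRetriever"])
        return pipeline, document_store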
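Note: the rewritten result loop turns each extractive answer into character offsets inside its source paragraph and hands them to semanticsearchAnnotator for highlighting. A self-contained illustration of that offset handling; the sample dict mirrors the fields of Haystack's Answer.to_dict(), and the bracket rendering is a stand-in for the annotator:

    # Hand-made sample with the fields the loop above actually uses.
    temp = {
        "answer": "reduce emissions by 40%",
        "document_id": "doc-1",
        "offsets_in_document": [{"start": 26, "end": 49}],
    }
    doc_content = "The national plan aims to reduce emissions by 40% before 2030."

    start_idx = temp["offsets_in_document"][0]["start"]
    end_idx = temp["offsets_in_document"][0]["end"]
    match = [[start_idx, end_idx]]

    # Stand-in for semanticsearchAnnotator(match, doc): bracket the matched span.
    print(doc_content[:start_idx] + "[" + doc_content[start_idx:end_idx] + "]"
          + doc_content[end_idx:])
    # -> The national plan aims to [reduce emissions by 40%] before 2030.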