prashant
committed on
Commit
·
43cd965
1
Parent(s):
3a88079
search UI changes
Browse files- appStore/keyword_search.py +39 -27
- docStore/sample/keywordexample.json +1 -1
- paramconfig.cfg +3 -2
- utils/lexical_search.py +18 -16
- utils/preprocessing.py +4 -1
- utils/semantic_search.py +23 -127
appStore/keyword_search.py
CHANGED
@@ -14,7 +14,8 @@ config = getconfig('paramconfig.cfg')
|
|
14 |
split_by = config.get('semantic_search','SPLIT_BY')
|
15 |
split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
|
16 |
split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
|
17 |
-
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
|
|
|
18 |
remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
|
19 |
embedding_model = config.get('semantic_search','RETRIEVER')
|
20 |
embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
@@ -22,6 +23,11 @@ embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
|
22 |
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
23 |
reader_model = config.get('semantic_search','READER')
|
24 |
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
def app():
|
27 |
|
@@ -49,22 +55,23 @@ def app():
|
|
49 |
keywordexample = json.load(json_file)
|
50 |
|
51 |
genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
|
52 |
-
if genre
|
53 |
-
keywordList = keywordexample[
|
54 |
-
elif genre == 'Climate':
|
55 |
-
|
56 |
-
elif genre == 'Social':
|
57 |
-
|
58 |
-
elif genre == 'Nature':
|
59 |
-
|
60 |
-
elif genre == 'Implementation':
|
61 |
-
|
62 |
else:
|
63 |
keywordList = None
|
64 |
|
65 |
-
searchtype = st.selectbox("Do you want to find exact macthes or similar \
|
66 |
-
|
67 |
-
|
|
|
68 |
|
69 |
st.markdown("---")
|
70 |
|
@@ -80,7 +87,7 @@ def app():
|
|
80 |
for and we will we will look for similar\
|
81 |
context in the document.",
|
82 |
placeholder="Enter keyword here")
|
83 |
-
|
84 |
if st.button("Find them"):
|
85 |
|
86 |
if queryList == "":
|
@@ -91,16 +98,22 @@ def app():
|
|
91 |
if 'filepath' in st.session_state:
|
92 |
|
93 |
|
94 |
-
if searchtype
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
else:
|
105 |
allDocuments = runSemanticPreprocessingPipeline(
|
106 |
file_path= st.session_state['filepath'],
|
@@ -109,7 +122,7 @@ def app():
|
|
109 |
split_length= split_length,
|
110 |
split_overlap=split_overlap,
|
111 |
removePunc= remove_punc,
|
112 |
-
|
113 |
|
114 |
|
115 |
logging.info("starting semantic search")
|
@@ -120,7 +133,6 @@ def app():
|
|
120 |
embedding_layer=embedding_layer,
|
121 |
embedding_model_format=embedding_model_format,
|
122 |
reader_model=reader_model,reader_top_k=reader_top_k,
|
123 |
-
|
124 |
retriever_top_k=retriever_top_k)
|
125 |
|
126 |
else:
|
|
|
14 |
split_by = config.get('semantic_search','SPLIT_BY')
|
15 |
split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
|
16 |
split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
|
17 |
+
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
|
18 |
+
'RESPECT_SENTENCE_BOUNDARY')))
|
19 |
remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
|
20 |
embedding_model = config.get('semantic_search','RETRIEVER')
|
21 |
embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
|
|
23 |
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
24 |
reader_model = config.get('semantic_search','READER')
|
25 |
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
26 |
+
lexical_split_by= config.get('lexical_search','SPLIT_BY')
|
27 |
+
lexical_split_length=int(config.get('lexical_search','SPLIT_LENGTH'))
|
28 |
+
lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
|
29 |
+
lexical_remove_punc = bool(int(config.get('lexical_search','REMOVE_PUNC')))
|
30 |
+
lexical_top_k=int(config.get('lexical_search','TOP_K'))
|
31 |
|
32 |
def app():
|
33 |
|
|
|
55 |
keywordexample = json.load(json_file)
|
56 |
|
57 |
genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
|
58 |
+
if genre:
|
59 |
+
keywordList = keywordexample[genre]
|
60 |
+
# elif genre == 'Climate':
|
61 |
+
# keywordList = keywordexample['Climate']
|
62 |
+
# elif genre == 'Social':
|
63 |
+
# keywordList = keywordexample['Social']
|
64 |
+
# elif genre == 'Nature':
|
65 |
+
# keywordList = keywordexample['Nature']
|
66 |
+
# elif genre == 'Implementation':
|
67 |
+
# keywordList = keywordexample['Implementation']
|
68 |
else:
|
69 |
keywordList = None
|
70 |
|
71 |
+
# searchtype = st.selectbox("Do you want to find exact macthes or similar \
|
72 |
+
# meaning/context",
|
73 |
+
# ['Exact Matches', 'Similar context/meaning'])
|
74 |
+
|
75 |
|
76 |
st.markdown("---")
|
77 |
|
|
|
87 |
for and we will we will look for similar\
|
88 |
context in the document.",
|
89 |
placeholder="Enter keyword here")
|
90 |
+
searchtype = st.checkbox("Show only Exact Matches")
|
91 |
if st.button("Find them"):
|
92 |
|
93 |
if queryList == "":
|
|
|
98 |
if 'filepath' in st.session_state:
|
99 |
|
100 |
|
101 |
+
if searchtype:
|
102 |
+
allDocuments = runLexicalPreprocessingPipeline(
|
103 |
+
file_name=st.session_state['filename'],
|
104 |
+
file_path=st.session_state['filepath'],
|
105 |
+
split_by=lexical_split_by,
|
106 |
+
split_length=lexical_split_length,
|
107 |
+
split_overlap=lexical_split_overlap,
|
108 |
+
removePunc=lexical_remove_punc),
|
109 |
+
logging.info("performing lexical search")
|
110 |
+
with st.spinner("Performing Exact matching search \
|
111 |
+
(Lexical search) for you"):
|
112 |
+
st.markdown("##### Top few lexical search (TFIDF) hits #####")
|
113 |
+
lexical_search(
|
114 |
+
query=queryList,
|
115 |
+
documents = allDocuments['documents'],
|
116 |
+
top_k = lexical_top_k )
|
117 |
else:
|
118 |
allDocuments = runSemanticPreprocessingPipeline(
|
119 |
file_path= st.session_state['filepath'],
|
|
|
122 |
split_length= split_length,
|
123 |
split_overlap=split_overlap,
|
124 |
removePunc= remove_punc,
|
125 |
+
split_respect_sentence_boundary=split_respect_sentence_boundary)
|
126 |
|
127 |
|
128 |
logging.info("starting semantic search")
|
|
|
133 |
embedding_layer=embedding_layer,
|
134 |
embedding_model_format=embedding_model_format,
|
135 |
reader_model=reader_model,reader_top_k=reader_top_k,
|
|
|
136 |
retriever_top_k=retriever_top_k)
|
137 |
|
138 |
else:
|
docStore/sample/keywordexample.json
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
{
|
2 |
"Food":"Food security,Nutrition,Diets,Food loss",
|
3 |
"Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
|
4 |
"Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
|
|
|
1 |
+
{
|
2 |
"Food":"Food security,Nutrition,Diets,Food loss",
|
3 |
"Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
|
4 |
"Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
|
paramconfig.cfg
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 20
|
3 |
-
SPLIT_BY =
|
4 |
-
SPLIT_LENGTH =
|
5 |
SPLIT_OVERLAP = 0
|
|
|
6 |
|
7 |
[semantic_search]
|
8 |
RETRIEVER_TOP_K = 10
|
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 20
|
3 |
+
SPLIT_BY = word
|
4 |
+
SPLIT_LENGTH = 120
|
5 |
SPLIT_OVERLAP = 0
|
6 |
+
REMOVE_PUNC = 0
|
7 |
|
8 |
[semantic_search]
|
9 |
RETRIEVER_TOP_K = 10
|
utils/lexical_search.py
CHANGED
@@ -8,9 +8,9 @@ from markdown import markdown
|
|
8 |
from annotated_text import annotation
|
9 |
from haystack.schema import Document
|
10 |
from typing import List, Text
|
|
|
11 |
from utils.preprocessing import processingpipeline
|
12 |
from utils.streamlitcheck import check_streamlit
|
13 |
-
import configparser
|
14 |
import logging
|
15 |
try:
|
16 |
from termcolor import colored
|
@@ -21,18 +21,17 @@ try:
|
|
21 |
import streamlit as st
|
22 |
except ImportError:
|
23 |
logging.info("Streamlit not installed")
|
24 |
-
config = configparser.ConfigParser()
|
25 |
-
try:
|
26 |
-
config.read_file(open('paramconfig.cfg'))
|
27 |
-
except Exception:
|
28 |
-
logging.warning("paramconfig file not found")
|
29 |
-
st.info("Please place the paramconfig file in the same directory as app.py")
|
30 |
|
31 |
|
32 |
-
def runLexicalPreprocessingPipeline(file_path,
|
|
|
|
|
|
|
33 |
"""
|
34 |
creates the pipeline and runs the preprocessing pipeline,
|
35 |
-
the params for pipeline are fetched from paramconfig
|
|
|
|
|
36 |
|
37 |
Params
|
38 |
------------
|
@@ -41,6 +40,11 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
|
|
41 |
st.session_state['filename']
|
42 |
file_path: filepath, in case of streamlit application use
|
43 |
st.session_state['filepath']
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
Return
|
46 |
--------------
|
@@ -52,14 +56,12 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
|
|
52 |
"""
|
53 |
|
54 |
lexical_processing_pipeline = processingpipeline()
|
55 |
-
|
56 |
-
split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
|
57 |
-
split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
|
58 |
|
59 |
output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
|
60 |
params= {"FileConverter": {"file_path": file_path, \
|
61 |
"file_name": file_name},
|
62 |
-
"UdfPreProcessor": {"removePunc":
|
63 |
"split_by": split_by, \
|
64 |
"split_length":split_length,\
|
65 |
"split_overlap": split_overlap}})
|
@@ -201,7 +203,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
|
|
201 |
else:
|
202 |
print(annotated_text)
|
203 |
|
204 |
-
def lexical_search(query:Text,documents:List[Document]):
|
205 |
"""
|
206 |
Performs the Lexical search on the List of haystack documents which is
|
207 |
returned by preprocessing Pipeline.
|
@@ -210,6 +212,7 @@ def lexical_search(query:Text,documents:List[Document]):
|
|
210 |
-------
|
211 |
query: Keywords that need to be searched in documents.
|
212 |
documents: List of Haystack documents returned by preprocessing pipeline.
|
|
|
213 |
|
214 |
"""
|
215 |
|
@@ -218,8 +221,7 @@ def lexical_search(query:Text,documents:List[Document]):
|
|
218 |
|
219 |
# Haystack Retriever works with document stores only.
|
220 |
retriever = TfidfRetriever(document_store)
|
221 |
-
results = retriever.retrieve(query=query,
|
222 |
-
top_k= int(config.get('lexical_search','TOP_K')))
|
223 |
query_tokens = tokenize_lexical_query(query)
|
224 |
for count, result in enumerate(results):
|
225 |
matches, doc = runSpacyMatcher(query_tokens,result.content)
|
|
|
8 |
from annotated_text import annotation
|
9 |
from haystack.schema import Document
|
10 |
from typing import List, Text
|
11 |
+
from typing_extensions import Literal
|
12 |
from utils.preprocessing import processingpipeline
|
13 |
from utils.streamlitcheck import check_streamlit
|
|
|
14 |
import logging
|
15 |
try:
|
16 |
from termcolor import colored
|
|
|
21 |
import streamlit as st
|
22 |
except ImportError:
|
23 |
logging.info("Streamlit not installed")
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
+
def runLexicalPreprocessingPipeline(file_path,file_name,
|
27 |
+
split_by: Literal["sentence", "word"] = 'word',
|
28 |
+
split_length:int = 80, removePunc:bool = False,
|
29 |
+
split_overlap:int = 0 )->List[Document]:
|
30 |
"""
|
31 |
creates the pipeline and runs the preprocessing pipeline,
|
32 |
+
the params for pipeline are fetched from paramconfig. As lexical search doesn't get
|
33 |
+
affected by overlap, therefore split_overlap = 0 in default paramconfig and
|
34 |
+
split_by = word.
|
35 |
|
36 |
Params
|
37 |
------------
|
|
|
40 |
st.session_state['filename']
|
41 |
file_path: filepath, in case of streamlit application use
|
42 |
st.session_state['filepath']
|
43 |
+
removePunc: to remove all Punctuation including ',' and '.' or not
|
44 |
+
split_by: document splitting strategy either as word or sentence
|
45 |
+
split_length: when synthetically creating the paragraphs from document,
|
46 |
+
it defines the length of paragraph.
|
47 |
+
splitting of text.
|
48 |
|
49 |
Return
|
50 |
--------------
|
|
|
56 |
"""
|
57 |
|
58 |
lexical_processing_pipeline = processingpipeline()
|
59 |
+
|
|
|
|
|
60 |
|
61 |
output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
|
62 |
params= {"FileConverter": {"file_path": file_path, \
|
63 |
"file_name": file_name},
|
64 |
+
"UdfPreProcessor": {"removePunc": removePunc, \
|
65 |
"split_by": split_by, \
|
66 |
"split_length":split_length,\
|
67 |
"split_overlap": split_overlap}})
|
|
|
203 |
else:
|
204 |
print(annotated_text)
|
205 |
|
206 |
+
def lexical_search(query:Text,top_k:int, documents:List[Document]):
|
207 |
"""
|
208 |
Performs the Lexical search on the List of haystack documents which is
|
209 |
returned by preprocessing Pipeline.
|
|
|
212 |
-------
|
213 |
query: Keywords that need to be searched in documents.
|
214 |
documents: List of Haystack documents returned by preprocessing pipeline.
|
215 |
+
top_k: Number of Top results to be fetched.
|
216 |
|
217 |
"""
|
218 |
|
|
|
221 |
|
222 |
# Haystack Retriever works with document stores only.
|
223 |
retriever = TfidfRetriever(document_store)
|
224 |
+
results = retriever.retrieve(query=query, top_k = top_k)
|
|
|
225 |
query_tokens = tokenize_lexical_query(query)
|
226 |
for count, result in enumerate(results):
|
227 |
matches, doc = runSpacyMatcher(query_tokens,result.content)
|
utils/preprocessing.py
CHANGED
@@ -167,7 +167,7 @@ class UdfPreProcessor(BaseComponent):
|
|
167 |
def run(self, documents:List[Document], removePunc:bool,
|
168 |
split_by: Literal["sentence", "word"] = 'sentence',
|
169 |
split_respect_sentence_boundary = False,
|
170 |
-
split_length:int = 2, split_overlap = 0):
|
171 |
|
172 |
""" this is required method to invoke the component in
|
173 |
the pipeline implementation.
|
@@ -181,6 +181,9 @@ class UdfPreProcessor(BaseComponent):
|
|
181 |
it defines the length of paragraph.
|
182 |
split_respect_sentence_boundary: Used when using 'word' strategy for
|
183 |
splitting of text.
|
|
|
|
|
|
|
184 |
|
185 |
Return
|
186 |
---------
|
|
|
167 |
def run(self, documents:List[Document], removePunc:bool,
|
168 |
split_by: Literal["sentence", "word"] = 'sentence',
|
169 |
split_respect_sentence_boundary = False,
|
170 |
+
split_length:int = 2, split_overlap:int = 0):
|
171 |
|
172 |
""" this is required method to invoke the component in
|
173 |
the pipeline implementation.
|
|
|
181 |
it defines the length of paragraph.
|
182 |
split_respect_sentence_boundary: Used when using 'word' strategy for
|
183 |
splitting of text.
|
184 |
+
split_overlap: Number of words or sentences that overlap when creating
|
185 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
186 |
+
when read together with others. Therefore the overlap is used.
|
187 |
|
188 |
Return
|
189 |
---------
|
utils/semantic_search.py
CHANGED
@@ -34,7 +34,13 @@ def loadQueryClassifier():
|
|
34 |
|
35 |
class QueryCheck(BaseComponent):
|
36 |
"""
|
37 |
-
Uses Query Classifier from Haystack, process the query based on query type
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
1. https://docs.haystack.deepset.ai/docs/query_classifier
|
39 |
|
40 |
"""
|
@@ -69,8 +75,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
|
|
69 |
split_length:int = 2, split_overlap = 0,
|
70 |
removePunc = False)->List[Document]:
|
71 |
"""
|
72 |
-
creates the pipeline and runs the preprocessing pipeline
|
73 |
-
the params for pipeline are fetched from paramconfig
|
74 |
|
75 |
Params
|
76 |
------------
|
@@ -132,7 +137,7 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
|
|
132 |
|
133 |
Return
|
134 |
-------
|
135 |
-
retriever:
|
136 |
"""
|
137 |
logging.info("loading retriever")
|
138 |
if document_store is None:
|
@@ -151,7 +156,7 @@ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = No
|
|
151 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
152 |
def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
|
153 |
"""
|
154 |
-
Creates the InMemory Document Store
|
155 |
It is mandatory component for Retriever to work in Haystack frame work.
|
156 |
|
157 |
Params
|
@@ -167,10 +172,6 @@ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
|
|
167 |
"""
|
168 |
document_store = InMemoryDocumentStore(similarity = similarity)
|
169 |
document_store.write_documents(documents)
|
170 |
-
# if check_streamlit:
|
171 |
-
# if 'retriever' in st.session_state:
|
172 |
-
# retriever = st.session_state['retriever']
|
173 |
-
# document_store.update_embeddings(retriever)
|
174 |
|
175 |
return document_store
|
176 |
|
@@ -182,11 +183,10 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
182 |
reader_model:str = None, reader_top_k:int = 10):
|
183 |
"""
|
184 |
creates the semantic search pipeline and document Store object from the
|
185 |
-
list of haystack documents.
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
node to process the query.
|
190 |
1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
|
191 |
2. https://www.sbert.net/examples/applications/semantic-search/README.html
|
192 |
3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
|
@@ -214,50 +214,22 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = Non
|
|
214 |
nodes [QueryCheck, Retriever, Reader]
|
215 |
|
216 |
document_store: As retriever can work only with Haystack Document Store, the
|
217 |
-
list of document returned by preprocessing pipeline
|
|
|
|
|
218 |
|
219 |
"""
|
220 |
-
document_store = createDocumentStore(documents)
|
221 |
-
# if check_streamlit:
|
222 |
-
# if 'retriever' in st.session_state:
|
223 |
-
# # if st.session_state['retriever']:
|
224 |
-
# retriever = st.session_state['retriever']
|
225 |
-
# else:
|
226 |
-
# if embedding_model:
|
227 |
retriever = loadRetriever(embedding_model = embedding_model,
|
228 |
embedding_model_format=embedding_model_format,
|
229 |
embedding_layer=embedding_layer,
|
230 |
retriever_top_k= retriever_top_k,
|
231 |
document_store = document_store)
|
232 |
|
233 |
-
# st.session_state['retriever'] = retriever
|
234 |
-
# else:
|
235 |
-
# logging.warning("no streamlit enviornment found, neither embedding model \
|
236 |
-
# provided")
|
237 |
-
# return
|
238 |
-
# elif embedding_model:
|
239 |
-
# retriever = loadRetriever(embedding_model = embedding_model,
|
240 |
-
# embedding_model_format=embedding_model_format,
|
241 |
-
# embedding_layer=embedding_layer,
|
242 |
-
# retriever_top_k= retriever_top_k,
|
243 |
-
# document_store = document_store)
|
244 |
-
|
245 |
-
|
246 |
document_store.update_embeddings(retriever)
|
247 |
-
# retriever.document_store = document_store
|
248 |
querycheck = QueryCheck()
|
249 |
-
# if check_streamlit:
|
250 |
-
# if 'reader' in st.session_state:
|
251 |
-
# reader = st.session_state['reader']
|
252 |
-
|
253 |
-
# else:
|
254 |
-
# if reader_model:
|
255 |
reader = FARMReader(model_name_or_path=reader_model,
|
256 |
top_k = reader_top_k, use_gpu=True)
|
257 |
-
# st.session_state['reader'] = reader
|
258 |
-
# elif reader_model:
|
259 |
-
# reader = FARMReader(model_name_or_path=reader_model,
|
260 |
-
# top_k = reader_top_k, use_gpu=True)
|
261 |
|
262 |
semanticsearch_pipeline = Pipeline()
|
263 |
semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
@@ -339,84 +311,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
|
|
339 |
end_idx = temp['offsets_in_document'][0]['end']
|
340 |
match = [[start_idx,end_idx]]
|
341 |
doc = doc_store.get_document_by_id(temp['document_id']).content
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
# if 'document_store' in st.session_state:
|
348 |
-
# document_store = st.session_state['document_store']
|
349 |
-
# temp = document_store.get_all_documents()
|
350 |
-
# if st.session_state['filename'] != temp[0].meta['name']:
|
351 |
-
|
352 |
-
# document_store = InMemoryDocumentStore()
|
353 |
-
# document_store.write_documents(documents)
|
354 |
-
# if 'retriever' in st.session_state:
|
355 |
-
# retriever = st.session_state['retriever']
|
356 |
-
# document_store.update_embeddings(retriever)
|
357 |
-
# # querycheck =
|
358 |
-
|
359 |
-
|
360 |
-
# # embedding_model = config.get('semantic_search','RETRIEVER')
|
361 |
-
# # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
362 |
-
# # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
363 |
-
# # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
364 |
-
# # retriever = EmbeddingRetriever(
|
365 |
-
# # document_store=document_store,
|
366 |
-
# # embedding_model=embedding_model,top_k = retriever_top_k,
|
367 |
-
# # emb_extraction_layer=embedding_layer, scale_score =True,
|
368 |
-
# # model_format=embedding_model_format, use_gpu = True)
|
369 |
-
# # document_store.update_embeddings(retriever)
|
370 |
-
# else:
|
371 |
-
# embedding_model = config.get('semantic_search','RETRIEVER')
|
372 |
-
# embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
373 |
-
# retriever = EmbeddingRetriever(
|
374 |
-
# document_store=document_store,
|
375 |
-
# embedding_model=embedding_model,top_k = retriever_top_k,
|
376 |
-
# emb_extraction_layer=embedding_layer, scale_score =True,
|
377 |
-
# model_format=embedding_model_format, use_gpu = True)
|
378 |
-
|
379 |
-
# else:
|
380 |
-
# document_store = InMemoryDocumentStore()
|
381 |
-
# document_store.write_documents(documents)
|
382 |
-
|
383 |
-
# embedding_model = config.get('semantic_search','RETRIEVER')
|
384 |
-
# embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
385 |
-
# embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
386 |
-
# retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
387 |
-
|
388 |
-
|
389 |
-
# retriever = EmbeddingRetriever(
|
390 |
-
# document_store=document_store,
|
391 |
-
# embedding_model=embedding_model,top_k = retriever_top_k,
|
392 |
-
# emb_extraction_layer=embedding_layer, scale_score =True,
|
393 |
-
# model_format=embedding_model_format, use_gpu = True)
|
394 |
-
# st.session_state['retriever'] = retriever
|
395 |
-
# document_store.update_embeddings(retriever)
|
396 |
-
# st.session_state['document_store'] = document_store
|
397 |
-
# querycheck = QueryCheck()
|
398 |
-
# st.session_state['querycheck'] = querycheck
|
399 |
-
# reader_model = config.get('semantic_search','READER')
|
400 |
-
# reader_top_k = retriever_top_k
|
401 |
-
# reader = FARMReader(model_name_or_path=reader_model,
|
402 |
-
# top_k = reader_top_k, use_gpu=True)
|
403 |
-
|
404 |
-
# st.session_state['reader'] = reader
|
405 |
-
|
406 |
-
# querycheck = QueryCheck()
|
407 |
-
|
408 |
-
# reader_model = config.get('semantic_search','READER')
|
409 |
-
# reader_top_k = retriever_top_k
|
410 |
-
# reader = FARMReader(model_name_or_path=reader_model,
|
411 |
-
# top_k = reader_top_k, use_gpu=True)
|
412 |
-
|
413 |
-
|
414 |
-
# semanticsearch_pipeline = Pipeline()
|
415 |
-
# semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
416 |
-
# inputs = ["Query"])
|
417 |
-
# semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
|
418 |
-
# inputs = ["QueryCheck.output_1"])
|
419 |
-
# semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
|
420 |
-
# inputs= ["EmbeddingRetriever"])
|
421 |
-
|
422 |
-
# return semanticsearch_pipeline, document_store
|
|
|
34 |
|
35 |
class QueryCheck(BaseComponent):
|
36 |
"""
|
37 |
+
Uses Query Classifier from Haystack, process the query based on query type.
|
38 |
+
Its ability to recognise statements is not very good, so there is a chance that a
|
39 |
+
statement also gets modified. Ex: "List water related issues" will be
|
40 |
+
identified by the model as keywords, and therefore it will be processed as "find
|
41 |
+
all issues related to 'list all water related issues'". This is one shortcoming
|
42 |
+
but it is ignored for now, as semantic search will not be affected much by this.
|
43 |
+
|
44 |
1. https://docs.haystack.deepset.ai/docs/query_classifier
|
45 |
|
46 |
"""
|
|
|
75 |
split_length:int = 2, split_overlap = 0,
|
76 |
removePunc = False)->List[Document]:
|
77 |
"""
|
78 |
+
creates the pipeline and runs the preprocessing pipeline.
|
|
|
79 |
|
80 |
Params
|
81 |
------------
|
|
|
137 |
|
138 |
Return
|
139 |
-------
|
140 |
+
retriever: embedding model
|
141 |
"""
|
142 |
logging.info("loading retriever")
|
143 |
if document_store is None:
|
|
|
156 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
157 |
def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
|
158 |
"""
|
159 |
+
Creates the InMemory Document Store from haystack list of Documents.
|
160 |
It is mandatory component for Retriever to work in Haystack frame work.
|
161 |
|
162 |
Params
|
|
|
172 |
"""
|
173 |
document_store = InMemoryDocumentStore(similarity = similarity)
|
174 |
document_store.write_documents(documents)
|
|
|
|
|
|
|
|
|
175 |
|
176 |
return document_store
|
177 |
|
|
|
183 |
reader_model:str = None, reader_top_k:int = 10):
|
184 |
"""
|
185 |
creates the semantic search pipeline and document Store object from the
|
186 |
+
list of haystack documents. The top_k for the Reader and Retriever are kept
|
187 |
+
same, so that all the results returned by Retriever are used, however the
|
188 |
+
context is extracted by Reader for each retrieved result. The querycheck is
|
189 |
+
added as node to process the query.
|
|
|
190 |
1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
|
191 |
2. https://www.sbert.net/examples/applications/semantic-search/README.html
|
192 |
3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
|
|
|
214 |
nodes [QueryCheck, Retriever, Reader]
|
215 |
|
216 |
document_store: As retriever can work only with Haystack Document Store, the
|
217 |
+
list of documents returned by preprocessing pipeline is fed in to get an
|
218 |
+
InMemoryDocumentStore object, with the retriever updating the
|
219 |
+
embeddings of each paragraph in document store.
|
220 |
|
221 |
"""
|
222 |
+
document_store = createDocumentStore(documents)
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
retriever = loadRetriever(embedding_model = embedding_model,
|
224 |
embedding_model_format=embedding_model_format,
|
225 |
embedding_layer=embedding_layer,
|
226 |
retriever_top_k= retriever_top_k,
|
227 |
document_store = document_store)
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
document_store.update_embeddings(retriever)
|
|
|
230 |
querycheck = QueryCheck()
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
reader = FARMReader(model_name_or_path=reader_model,
|
232 |
top_k = reader_top_k, use_gpu=True)
|
|
|
|
|
|
|
|
|
233 |
|
234 |
semanticsearch_pipeline = Pipeline()
|
235 |
semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
|
|
311 |
end_idx = temp['offsets_in_document'][0]['end']
|
312 |
match = [[start_idx,end_idx]]
|
313 |
doc = doc_store.get_document_by_id(temp['document_id']).content
|
314 |
+
if check_streamlit:
|
315 |
+
st.write("Result {}".format(i+1))
|
316 |
+
else:
|
317 |
+
print("Result {}".format(i+1))
|
318 |
+
semanticsearchAnnotator(match, doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|