Commit 7d78a3b ("semantic updates") committed by prashant
Parent(s): f59362a

Files changed:
- appStore/keyword_search.py  +32 -10
- paramconfig.cfg             +2 -0
- utils/sdg_classifier.py     +0 -1
- utils/semantic_search.py    +128 -43
appStore/keyword_search.py
CHANGED
@@ -7,6 +7,21 @@ import json
 import logging
 from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
 from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_search
+from utils.checkconfig import getconfig
+
+# Declare all the necessary variables
+config = getconfig('paramconfig.cfg')
+split_by = config.get('semantic_search','SPLIT_BY')
+split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
+split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
+split_respect_sentence_boundary = bool(int(config.get('semantic_search','RESPECT_SENTENCE_BOUNDARY')))
+remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
+embedding_model = config.get('semantic_search','RETRIEVER')
+embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+reader_model = config.get('semantic_search','READER')
+reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 
 def app():
 
@@ -77,19 +92,26 @@ def app():
 
 
     if searchtype == 'Exact Matches':
-        allDocuments = runLexicalPreprocessingPipeline(
-            st.session_state['filepath'],
-            st.session_state['filename'])
-        logging.info("performing lexical search")
-        with st.spinner("Performing Exact matching search \
-            (Lexical search) for you"):
-            st.markdown("##### Top few lexical search (TFIDF) hits #####")
-            lexical_search(queryList,allDocuments['documents'])
+        # allDocuments = runLexicalPreprocessingPipeline(
+        #     st.session_state['filepath'],
+        #     st.session_state['filename'])
+        # logging.info("performing lexical search")
+        # with st.spinner("Performing Exact matching search \
+        #     (Lexical search) for you"):
+        #     st.markdown("##### Top few lexical search (TFIDF) hits #####")
+        #     lexical_search(queryList,allDocuments['documents'])
+        pass
    else:
        allDocuments = runSemanticPreprocessingPipeline(
-           st.session_state['filepath'],
-           st.session_state['filename'])
+           file_path= st.session_state['filepath'],
+           file_name = st.session_state['filename'],
+           split_by=split_by,
+           split_length= split_length,
+           split_overlap=split_overlap,
+           removePunc= remove_punc,
+           split_respect_sentence_boundary=split_respect_sentence_boundary)
 
+
        logging.info("starting semantic search")
        with st.spinner("Performing Similar/Contextual search"):
            semantic_search(queryList,allDocuments['documents'])
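Note on the new import: utils/checkconfig.py is not part of this commit, so the getconfig helper used above is only implied by its call sites. A minimal sketch of what such a helper presumably looks like, assuming only the name and the ConfigParser-style return value visible in keyword_search.py (it mirrors the configparser boilerplate this commit deletes from utils/semantic_search.py, which is its most likely origin). Also note that reader_top_k deliberately reuses the RETRIEVER_TOP_K key; the semanticSearchPipeline docstring below states the two values are kept equal.

    import configparser
    import logging

    def getconfig(configfile_path: str) -> configparser.ConfigParser:
        """Hypothetical reconstruction: read the paramconfig file and return
        the parser, so callers can use config.get('<section>', '<KEY>')."""
        config = configparser.ConfigParser()
        try:
            config.read_file(open(configfile_path))
        except FileNotFoundError:
            logging.warning("paramconfig file %s not found", configfile_path)
        return config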
paramconfig.cfg
CHANGED
@@ -16,6 +16,8 @@ THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
 SPLIT_OVERLAP = 0
+RESPECT_SENTENCE_BOUNDARY = 1
+REMOVE_PUNC = 0
 
 [sdg]
 THRESHOLD = 0.85
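The two new keys are 0/1 integer flags rather than True/False literals because configparser returns every value as a raw string; the callers in appStore/keyword_search.py therefore parse them with bool(int(...)). A short illustration of why the inner int() matters, using the section and key names from this config:

    import configparser

    config = configparser.ConfigParser()
    config.read_string(
        "[semantic_search]\n"
        "RESPECT_SENTENCE_BOUNDARY = 1\n"
        "REMOVE_PUNC = 0\n")

    # Any non-empty string is truthy, so bool() alone would always give True.
    assert bool(config.get('semantic_search', 'REMOVE_PUNC')) is True
    # bool(int(...)) is what keyword_search.py does: '0' -> 0 -> False.
    assert bool(int(config.get('semantic_search', 'REMOVE_PUNC'))) is False
    # configparser's own getboolean() also accepts 0/1, an alternative here.
    assert config.getboolean('semantic_search', 'REMOVE_PUNC') is False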
utils/sdg_classifier.py
CHANGED
@@ -2,7 +2,6 @@ from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
 from typing import List, Tuple
 from typing_extensions import Literal
-import configparser
 import logging
 import pandas as pd
 from pandas import DataFrame, Series
utils/semantic_search.py
CHANGED
@@ -2,11 +2,11 @@ from haystack.nodes import TransformersQueryClassifier
 from haystack.nodes import EmbeddingRetriever, FARMReader
 from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
-import configparser
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
+from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
 from haystack.pipelines import Pipeline

@@ -19,16 +19,15 @@ try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.info("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")
 
 
 @st.cache(allow_output_mutation=True)
 def loadQueryClassifier():
+    """
+    returns the haystack query classifier model
+    model = shahrukhx01/bert-mini-finetune-question-detection
+
+    """
     query_classifier = TransformersQueryClassifier(model_name_or_path=
                         "shahrukhx01/bert-mini-finetune-question-detection")
     return query_classifier

@@ -63,8 +62,12 @@ class QueryCheck(BaseComponent):
     def run_batch(self, query):
         pass
 
-
-def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
+@st.cache(allow_output_mutation=True)
+def runSemanticPreprocessingPipeline(file_path, file_name,
+                split_by: Literal["sentence", "word"] = 'sentence',
+                split_respect_sentence_boundary = False,
+                split_length:int = 2, split_overlap = 0,
+                removePunc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig

@@ -76,6 +79,12 @@ def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
         st.session_state['filename']
     file_path: filepath, in case of streamlit application use
         st.session_state['filepath']
+    removePunc: whether to remove all punctuation, including ',' and '.', or not
+    split_by: document splitting strategy, either as word or sentence
+    split_length: when synthetically creating the paragraphs from document,
+        it defines the length of a paragraph.
+    split_respect_sentence_boundary: used when using the 'word' strategy for
+        splitting of text.
 
     Return
     --------------

@@ -87,61 +96,90 @@ def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
     """
 
     semantic_processing_pipeline = processingpipeline()
-    split_by = config.get('semantic_search','SPLIT_BY')
-    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
 
     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                         params= {"FileConverter": {"file_path": file_path, \
                                     "file_name": file_name},
-                                "UdfPreProcessor": {"removePunc":
+                                "UdfPreProcessor": {"removePunc": removePunc, \
                                 "split_by": split_by, \
                                 "split_length":split_length,\
-                                "split_overlap": split_overlap}})
+                                "split_overlap": split_overlap,
+                                "split_respect_sentence_boundary":split_respect_sentence_boundary}})
 
     return output_semantic_pre
 
 
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
-def loadRetriever(embedding_model = None, embedding_model_format = None,
-                embedding_layer = None, retriever_top_k = 10,
+def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
+                embedding_layer:int = None, retriever_top_k:int = 10,
+                document_store:InMemoryDocumentStore = None):
+    """
+    Returns the Retriever model based on params provided.
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+
+
+    Params
+    ---------
+    embedding_model: Name of the model to be used for embedding. Check the links
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in documentation
+    embedding_layer: check the github link of Haystack provided in documentation
+    retriever_top_k: Number of top results to be returned by retriever
+    document_store: InMemoryDocumentStore; write the haystack Document list to the
+        DocumentStore and pass it to this function call. Can be done using createDocumentStore from utils.
+
+    Return
+    -------
+    retriever: embedding model
+    """
     logging.info("loading retriever")
     if document_store is None:
         logging.warning("Retriever initialization requires the DocumentStore")
         return
-
-
-    if embedding_model is None:
-        try:
-            embedding_model = config.get('semantic_search','RETRIEVER')
-            embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-            embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-            retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-        except Exception as e:
-            logging.info(e)
-            st.info(e)
 
     retriever = EmbeddingRetriever(
         embedding_model=embedding_model,top_k = retriever_top_k,
         document_store = document_store,
         emb_extraction_layer=embedding_layer, scale_score =True,
         model_format=embedding_model_format, use_gpu = True)
-
+    if check_streamlit():
+        st.session_state['retriever'] = retriever
     return retriever
 
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
+    """
+    Creates the InMemory Document Store from the haystack list of Documents.
+    It is a mandatory component for the Retriever to work in the Haystack framework.
+
+    Params
+    -------
+    documents: List of haystack documents. If using the preprocessing pipeline,
+        these can be fetched with key = 'documents' from the output of the preprocessing pipeline.
+    similarity: scoring function, can be either 'cosine' or 'dot_product'
+
+    Return
+    -------
+    document_store: InMemory Document Store object type.
+
+    """
     document_store = InMemoryDocumentStore(similarity = similarity)
     document_store.write_documents(documents)
-    if 'retriever' in st.session_state:
-        retriever = st.session_state['retriever']
-        document_store.update_embeddings(retriever)
+    # if check_streamlit():
+    #     if 'retriever' in st.session_state:
+    #         retriever = st.session_state['retriever']
+    #         document_store.update_embeddings(retriever)
 
     return document_store
 
 
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
-def semanticSearchPipeline(documents:List[Document]):
+def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
+                embedding_model_format:Text = None,
+                embedding_layer:int = None, retriever_top_k:int = 10,
+                reader_model:str = None, reader_top_k:int = 10):
     """
     creates the semantic search pipeline and document Store object from the
     list of haystack documents. Retriever and Reader model are read from

@@ -149,32 +187,66 @@ def semanticSearchPipeline(documents:List[Document]):
     all the results returned by Retriever are used, however the context is
     extracted by Reader for each retrieved result. The querycheck is added as
     node to process the query.
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+    4. https://docs.haystack.deepset.ai/docs/reader
+
 
     Params
     ----------
     documents: list of Haystack Documents, returned by preprocessing pipeline.
+    embedding_model: Name of the model to be used for embedding. Check the links
+        provided in documentation
+    embedding_model_format: check the github link of Haystack provided in documentation
+    embedding_layer: check the github link of Haystack provided in documentation
+    retriever_top_k: Number of top results to be returned by retriever
+    reader_model: Name of the model to be used for the Reader node in the haystack
+        Pipeline. Check the links provided in documentation
+    reader_top_k: Reader will use retrieved results to further find better matches.
+        As the purpose here is to use the reader to extract context, the value is
+        the same as retriever_top_k.
 
     Return
     ---------
     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
         nodes [QueryCheck, Retriever, Reader]
 
-    document_store: As retriever
+    document_store: As retriever can work only with Haystack Document Store, the
         list of document returned by preprocessing pipeline.
 
     """
     document_store = createDocumentStore(documents)
-    retriever = loadRetriever(document_store = document_store)
+    if check_streamlit():
+        if 'retriever' in st.session_state:
+            if st.session_state['retriever']:
+                retriever = st.session_state['retriever']
+        else:
+            if embedding_model:
+                retriever = loadRetriever(embedding_model = embedding_model,
+                        embedding_model_format=embedding_model_format,
+                        embedding_layer=embedding_layer,
+                        retriever_top_k= retriever_top_k,
+                        document_store = document_store)
+
+                st.session_state['retriever'] = retriever
+    else:
+        logging.warning("no streamlit environment found and no embedding model \
+            provided")
+        return
+
     document_store.update_embeddings(retriever)
+    retriever.document_store = document_store
     querycheck = QueryCheck()
+    if check_streamlit():
+        if 'reader' in st.session_state:
+            reader = st.session_state['reader']
+
     else:
-        reader_model = config.get('semantic_search','READER')
-        reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-        reader = FARMReader(model_name_or_path=reader_model,
-                    top_k = reader_top_k, use_gpu=True)
+        if reader_model:
+            reader = FARMReader(model_name_or_path=reader_model,
+                        top_k = reader_top_k, use_gpu=True)
         st.session_state['reader'] = reader
 
     semanticsearch_pipeline = Pipeline()

@@ -224,7 +296,10 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
     print(annotated_text)
 
 
-def semantic_search(query:Text,documents:List[Document]):
+def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
+                embedding_model_format:Text,
+                embedding_layer:int, reader_model:str,
+                retriever_top_k:int = 10, reader_top_k:int = 10):
     """
     Performs the Semantic search on the List of haystack documents which is
     returned by preprocessing Pipeline.

@@ -235,9 +310,19 @@ def semantic_search(query:Text,documents:List[Document]):
     documents: List of Haystack documents returned by preprocessing pipeline.
 
     """
-    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents)
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+                        embedding_model= embedding_model,
+                        embedding_layer= embedding_layer,
+                        embedding_model_format= embedding_model_format,
+                        reader_model= reader_model, retriever_top_k= retriever_top_k,
+                        reader_top_k= reader_top_k)
+
     results = semanticsearch_pipeline.run(query = query)
 
+    if check_streamlit():
+        st.markdown("##### Top few semantic search results #####")
+    else:
+        print("Top few semantic search results")
     for i,answer in enumerate(results['answers']):
        temp = answer.to_dict()
        start_idx = temp['offsets_in_document'][0]['start']
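Taken together, this commit moves every configparser read out of utils/semantic_search.py: the pipeline builders are now driven entirely by arguments, with st.session_state caching the Retriever and Reader between Streamlit reruns. A minimal sketch of the new calling convention follows; the file path and model names here are placeholders chosen for illustration, not values from this repository (in the app they come from paramconfig.cfg via getconfig):

    from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                       semantic_search)

    # Split settings mirror the paramconfig.cfg [semantic_search] section.
    processed = runSemanticPreprocessingPipeline(
        file_path='data/sample.pdf', file_name='sample.pdf',
        split_by='sentence', split_length=3, split_overlap=0,
        split_respect_sentence_boundary=True, removePunc=False)

    # Every model and parameter is now an explicit argument instead of a
    # config read inside the module; model names below are hypothetical.
    semantic_search(query='What are the adaptation strategies?',
                    documents=processed['documents'],
                    embedding_model='sentence-transformers/all-MiniLM-L6-v2',
                    embedding_model_format='sentence_transformers',
                    embedding_layer=-1,
                    reader_model='deepset/tinyroberta-squad2',
                    retriever_top_k=10, reader_top_k=10)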