Spaces:

GIZ
/

SDSN-demo

Runtime error

App Files Files Community

prashant committed on Nov 15, 2022

Commit

43cd965

1 Parent(s): 3a88079

search UI changes

Browse files

Files changed (6) hide show

appStore/keyword_search.py +39 -27
docStore/sample/keywordexample.json +1 -1
paramconfig.cfg +3 -2
utils/lexical_search.py +18 -16
utils/preprocessing.py +4 -1
utils/semantic_search.py +23 -127

appStore/keyword_search.py CHANGED Viewed

@@ -14,7 +14,8 @@ config = getconfig('paramconfig.cfg')
 split_by = config.get('semantic_search','SPLIT_BY')
 split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
 split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
-split_respect_sentence_boundary = bool(int(config.get('semantic_search','RESPECT_SENTENCE_BOUNDARY')))
 remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
 embedding_model = config.get('semantic_search','RETRIEVER')
 embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
@@ -22,6 +23,11 @@ embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
 retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 reader_model = config.get('semantic_search','READER')
 reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 def app():
@@ -49,22 +55,23 @@ def app():
             keywordexample = json.load(json_file)
         genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
-        if genre == 'Food':
-            keywordList = keywordexample['Food']
-        elif genre == 'Climate':
-            keywordList = keywordexample['Climate']
-        elif genre == 'Social':
-            keywordList = keywordexample['Social']
-        elif genre == 'Nature':
-            keywordList = keywordexample['Nature']
-        elif genre == 'Implementation':
-            keywordList = keywordexample['Implementation']
         else:
             keywordList = None
-        searchtype = st.selectbox("Do you want to find exact macthes or similar \
-                                    meaning/context",
-                                 ['Exact Matches', 'Similar context/meaning'])
         st.markdown("---")
@@ -80,7 +87,7 @@ def app():
                                         for and we will we will look for similar\
                                         context in the document.",
                                     placeholder="Enter keyword here")
         if st.button("Find them"):
             if queryList == "":
@@ -91,16 +98,22 @@ def app():
                 if 'filepath' in st.session_state:
-                    if searchtype == 'Exact Matches':
-                        # allDocuments = runLexicalPreprocessingPipeline(
-                        #                     st.session_state['filepath'],
-                        #                     st.session_state['filename'])
-                        # logging.info("performing lexical search")
-                        # with st.spinner("Performing Exact matching search \
-                        #                 (Lexical search) for you"):
-                        #     st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                        #     lexical_search(queryList,allDocuments['documents'])
-                        pass
                     else:
                         allDocuments = runSemanticPreprocessingPipeline(
                                             file_path= st.session_state['filepath'],
@@ -109,7 +122,7 @@ def app():
                                             split_length= split_length,
                                             split_overlap=split_overlap,
                                             removePunc= remove_punc,
-                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                         logging.info("starting semantic search")
@@ -120,7 +133,6 @@ def app():
                             embedding_layer=embedding_layer,
                             embedding_model_format=embedding_model_format,
                             reader_model=reader_model,reader_top_k=reader_top_k,
                             retriever_top_k=retriever_top_k)
                 else:

 split_by = config.get('semantic_search','SPLIT_BY')
 split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
 split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
+split_respect_sentence_boundary = bool(int(config.get('semantic_search',
+                                    'RESPECT_SENTENCE_BOUNDARY')))
 remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))
 embedding_model = config.get('semantic_search','RETRIEVER')
 embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
 retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
 reader_model = config.get('semantic_search','READER')
 reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+lexical_split_by= config.get('lexical_search','SPLIT_BY')
+lexical_split_length=int(config.get('lexical_search','SPLIT_LENGTH'))
+lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
+lexical_remove_punc = bool(int(config.get('lexical_search','REMOVE_PUNC')))
+lexical_top_k=int(config.get('lexical_search','TOP_K'))
 def app():
             keywordexample = json.load(json_file)
         genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
+        if genre:
+            keywordList = keywordexample[genre]
+        # elif genre == 'Climate':
+        #     keywordList = keywordexample['Climate']
+        # elif genre == 'Social':
+        #     keywordList = keywordexample['Social']
+        # elif genre == 'Nature':
+        #     keywordList = keywordexample['Nature']
+        # elif genre == 'Implementation':
+        #     keywordList = keywordexample['Implementation']
         else:
             keywordList = None
+        # searchtype = st.selectbox("Do you want to find exact macthes or similar \
+        #                             meaning/context",
+        #                          ['Exact Matches', 'Similar context/meaning'])
         st.markdown("---")
                                         for and we will we will look for similar\
                                         context in the document.",
                                     placeholder="Enter keyword here")
+        searchtype = st.checkbox("Show only Exact Matches")
         if st.button("Find them"):
             if queryList == "":
                 if 'filepath' in st.session_state:
+                    if searchtype:
+                        allDocuments = runLexicalPreprocessingPipeline(
+                                    file_name=st.session_state['filename'],
+                                    file_path=st.session_state['filepath'],
+                                    split_by=lexical_split_by,
+                                    split_length=lexical_split_length,
+                                    split_overlap=lexical_split_overlap,
+                                    removePunc=lexical_remove_punc),
+                        logging.info("performing lexical search")
+                        with st.spinner("Performing Exact matching search \
+                                        (Lexical search) for you"):
+                            st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                            lexical_search(
+                                query=queryList,
+                                documents = allDocuments['documents'],
+                                top_k = lexical_top_k )
                     else:
                         allDocuments = runSemanticPreprocessingPipeline(
                                             file_path= st.session_state['filepath'],
                                             split_length= split_length,
                                             split_overlap=split_overlap,
                                             removePunc= remove_punc,
+                        split_respect_sentence_boundary=split_respect_sentence_boundary)
                         logging.info("starting semantic search")
                             embedding_layer=embedding_layer,
                             embedding_model_format=embedding_model_format,
                             reader_model=reader_model,reader_top_k=reader_top_k,
                             retriever_top_k=retriever_top_k)
                 else:

docStore/sample/keywordexample.json CHANGED Viewed

@@ -1,4 +1,4 @@
-{"I will enter my own keyword":[],
 "Food":"Food security,Nutrition,Diets,Food loss",
 "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
 "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",

+{
 "Food":"Food security,Nutrition,Diets,Food loss",
 "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
 "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",

paramconfig.cfg CHANGED Viewed

@@ -1,8 +1,9 @@
 [lexical_search]
 TOP_K = 20
-SPLIT_BY = sentence
-SPLIT_LENGTH = 3
 SPLIT_OVERLAP = 0
 [semantic_search]
 RETRIEVER_TOP_K = 10

 [lexical_search]
 TOP_K = 20
+SPLIT_BY = word
+SPLIT_LENGTH = 120
 SPLIT_OVERLAP = 0
+REMOVE_PUNC = 0
 [semantic_search]
 RETRIEVER_TOP_K = 10

utils/lexical_search.py CHANGED Viewed

@@ -8,9 +8,9 @@ from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
-import configparser
 import logging
 try:
     from termcolor import colored
@@ -21,18 +21,17 @@ try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.warning("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")
-def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
     Params
     ------------
@@ -41,6 +40,11 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
     st.session_state['filename']
     file_path: filepath, in case of streamlit application use
     st.session_state['filepath']
     Return
     --------------
@@ -52,14 +56,12 @@ def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
     """
     lexical_processing_pipeline = processingpipeline()
-    split_by = config.get('lexical_search','SPLIT_BY')
-    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
-                                        "UdfPreProcessor": {"removePunc": False, \
                                             "split_by": split_by, \
                                             "split_length":split_length,\
                                             "split_overlap": split_overlap}})
@@ -201,7 +203,7 @@ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
     else:
         print(annotated_text)
-def lexical_search(query:Text,documents:List[Document]):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.
@@ -210,6 +212,7 @@ def lexical_search(query:Text,documents:List[Document]):
     -------
     query: Keywords that need to be searche in documents.
     documents: List of Haystack documents returned by preprocessing pipeline.
     """
@@ -218,8 +221,7 @@ def lexical_search(query:Text,documents:List[Document]):
     # Haystack Retriever works with document stores only.
     retriever = TfidfRetriever(document_store)
-    results = retriever.retrieve(query=query,
-                            top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)

 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
+from typing_extensions import Literal
 from utils.preprocessing import processingpipeline
 from utils.streamlitcheck import check_streamlit
 import logging
 try:
     from termcolor import colored
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
+def runLexicalPreprocessingPipeline(file_path,file_name,
+                        split_by: Literal["sentence", "word"] = 'word',
+                        split_length:int = 80, removePunc:bool = False,
+                        split_overlap:int = 0 )->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig. As lexical search is
+    not affected by overlap, split_overlap = 0 in the default paramconfig and
+    split_by = word.
     Params
     ------------
     st.session_state['filename']
     file_path: filepath, in case of streamlit application use
     st.session_state['filepath']
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragraphs from document,
+                    it defines the length of paragraph.
+    split_overlap: number of words or sentences that overlap between
+                    consecutive paragraphs when splitting the text.
     Return
     --------------
     """
     lexical_processing_pipeline = processingpipeline()
     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
+                                        "UdfPreProcessor": {"removePunc": removePunc, \
                                             "split_by": split_by, \
                                             "split_length":split_length,\
                                             "split_overlap": split_overlap}})
     else:
         print(annotated_text)
+def lexical_search(query:Text,top_k:int, documents:List[Document]):
     """
     Performs the Lexical search on the List of haystack documents which is
     returned by preprocessing Pipeline.
     -------
     query: Keywords that need to be searche in documents.
     documents: List of Haystack documents returned by preprocessing pipeline.
+    top_k: Number of Top results to be fetched.
     """
     # Haystack Retriever works with document stores only.
     retriever = TfidfRetriever(document_store)
+    results = retriever.retrieve(query=query, top_k = top_k)
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)

utils/preprocessing.py CHANGED Viewed

@@ -167,7 +167,7 @@ class UdfPreProcessor(BaseComponent):
     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_respect_sentence_boundary = False,
-            split_length:int = 2, split_overlap = 0):
         """ this is required method to invoke the component in
         the pipeline implementation.
@@ -181,6 +181,9 @@ class UdfPreProcessor(BaseComponent):
                       it defines the length of paragraph.
         split_respect_sentence_boundary: Used when using 'word' strategy for
         splititng of text.
         Return
         ---------

     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
             split_respect_sentence_boundary = False,
+            split_length:int = 2, split_overlap:int = 0):
         """ this is required method to invoke the component in
         the pipeline implementation.
                       it defines the length of paragraph.
         split_respect_sentence_boundary: Used when using 'word' strategy for
         splititng of text.
+        split_overlap: Number of words or sentences that overlap when creating
+        the paragraphs. This is done because a sentence or 'some words' make
+        sense when read together with others. Therefore the overlap is used.
         Return
         ---------

utils/semantic_search.py CHANGED Viewed

@@ -34,7 +34,13 @@ def loadQueryClassifier():
 class QueryCheck(BaseComponent):
     """
-    Uses Query Classifier from Haystack, process the query based on query type
     1. https://docs.haystack.deepset.ai/docs/query_classifier
     """
@@ -69,8 +75,7 @@ def runSemanticPreprocessingPipeline(file_path, file_name,
                 split_length:int = 2, split_overlap = 0,
                 removePunc = False)->List[Document]:
     """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
     Params
     ------------
@@ -132,7 +137,7 @@ def loadRetriever(embedding_model:Text =  None, embedding_model_format:Text = No
     Return
     -------
-    retriever: emebedding model
     """
     logging.info("loading retriever")
     if document_store is None:
@@ -151,7 +156,7 @@ def loadRetriever(embedding_model:Text =  None, embedding_model_format:Text = No
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
     """
-    Creates the InMemory Document Store frm haystack list of Documents.
     It is  mandatory component for Retriever to work in Haystack frame work.
     Params
@@ -167,10 +172,6 @@ def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
     """
     document_store = InMemoryDocumentStore(similarity = similarity)
     document_store.write_documents(documents)
-    # if check_streamlit:
-    #     if 'retriever' in st.session_state:
-    #         retriever = st.session_state['retriever']
-    #         document_store.update_embeddings(retriever)
     return document_store
@@ -182,11 +183,10 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text =  Non
                  reader_model:str =  None, reader_top_k:int = 10):
     """
     creates the semantic search pipeline and document Store object from the
-    list of haystack documents. Retriever and Reader model are read from
-    paramconfig. The top_k for the Reader and Retirever are kept same, so that
-    all the results returned by Retriever are used, however the context is
-    extracted by Reader for each retrieved result. The querycheck is added as
-    node to process the query.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
@@ -214,50 +214,22 @@ def semanticSearchPipeline(documents:List[Document], embedding_model:Text =  Non
     nodes [QueryCheck, Retriever, Reader]
     document_store: As retriever can work only with Haystack Document Store, the
-    list of document returned by preprocessing pipeline.
     """
-    document_store = createDocumentStore(documents)
-    # if check_streamlit:
-    #     if 'retriever' in st.session_state:
-    #         # if st.session_state['retriever']:
-    #         retriever = st.session_state['retriever']
-    #     else:
-    #         if embedding_model:
     retriever = loadRetriever(embedding_model = embedding_model,
                     embedding_model_format=embedding_model_format,
                     embedding_layer=embedding_layer,
                     retriever_top_k= retriever_top_k,
                     document_store = document_store)
-                # st.session_state['retriever'] = retriever
-    #         else:
-    #             logging.warning("no streamlit enviornment found, neither embedding model \
-    #                 provided")
-    #             return
-    # elif embedding_model:
-    #     retriever = loadRetriever(embedding_model = embedding_model,
-    #                             embedding_model_format=embedding_model_format,
-    #                             embedding_layer=embedding_layer,
-    #                             retriever_top_k= retriever_top_k,
-    #                             document_store = document_store)
     document_store.update_embeddings(retriever)
-    # retriever.document_store = document_store
     querycheck = QueryCheck()
-    # if check_streamlit:
-    #     if 'reader' in st.session_state:
-    #         reader = st.session_state['reader']
-    #     else:
-    #         if reader_model:
     reader = FARMReader(model_name_or_path=reader_model,
                     top_k = reader_top_k, use_gpu=True)
-    #             st.session_state['reader'] = reader
-    # elif reader_model:
-    #             reader = FARMReader(model_name_or_path=reader_model,
-    #                             top_k = reader_top_k, use_gpu=True)
     semanticsearch_pipeline = Pipeline()
     semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
@@ -339,84 +311,8 @@ def semantic_search(query:Text,documents:List[Document],embedding_model:Text,
         end_idx = temp['offsets_in_document'][0]['end']
         match = [[start_idx,end_idx]]
         doc = doc_store.get_document_by_id(temp['document_id']).content
-        st.write("Result {}".format(i+1))
-        semanticsearchAnnotator(match, doc)
-    # if 'document_store' in st.session_state:
-    #     document_store = st.session_state['document_store']
-    #     temp  = document_store.get_all_documents()
-    #     if st.session_state['filename'] != temp[0].meta['name']:
-    #         document_store = InMemoryDocumentStore()
-    #         document_store.write_documents(documents)
-    #         if 'retriever' in st.session_state:
-    #             retriever = st.session_state['retriever']
-    #             document_store.update_embeddings(retriever)
-    #             # querycheck =
-    #         # embedding_model = config.get('semantic_search','RETRIEVER')
-    #         # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-    #         # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-    #         # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-    #         # retriever = EmbeddingRetriever(
-    #         #     document_store=document_store,
-    #         #     embedding_model=embedding_model,top_k = retriever_top_k,
-    #         #     emb_extraction_layer=embedding_layer, scale_score =True,
-    #         #     model_format=embedding_model_format, use_gpu = True)
-    #         # document_store.update_embeddings(retriever)
-    #     else:
-    #         embedding_model = config.get('semantic_search','RETRIEVER')
-    #         embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-    #         retriever = EmbeddingRetriever(
-    #             document_store=document_store,
-    #             embedding_model=embedding_model,top_k = retriever_top_k,
-    #             emb_extraction_layer=embedding_layer, scale_score =True,
-    #             model_format=embedding_model_format, use_gpu = True)
-    # else:
-    #     document_store = InMemoryDocumentStore()
-    #     document_store.write_documents(documents)
-    #     embedding_model = config.get('semantic_search','RETRIEVER')
-    #     embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
-    #     embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
-    #     retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
-    #     retriever = EmbeddingRetriever(
-    #         document_store=document_store,
-    #         embedding_model=embedding_model,top_k = retriever_top_k,
-    #         emb_extraction_layer=embedding_layer, scale_score =True,
-    #         model_format=embedding_model_format, use_gpu = True)
-    #     st.session_state['retriever'] = retriever
-    #     document_store.update_embeddings(retriever)
-    #     st.session_state['document_store'] = document_store
-    #     querycheck = QueryCheck()
-    #     st.session_state['querycheck'] = querycheck
-    #     reader_model = config.get('semantic_search','READER')
-    #     reader_top_k = retriever_top_k
-    #     reader = FARMReader(model_name_or_path=reader_model,
-    #                     top_k = reader_top_k, use_gpu=True)
-    #     st.session_state['reader'] = reader
-    # querycheck = QueryCheck()
-    # reader_model = config.get('semantic_search','READER')
-    # reader_top_k = retriever_top_k
-    # reader = FARMReader(model_name_or_path=reader_model,
-    #                 top_k = reader_top_k, use_gpu=True)
-    # semanticsearch_pipeline = Pipeline()
-    # semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
-    #                                 inputs = ["Query"])
-    # semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
-    #                                 inputs = ["QueryCheck.output_1"])
-    # semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
-    #                                 inputs= ["EmbeddingRetriever"])
-    # return semanticsearch_pipeline, document_store

 class QueryCheck(BaseComponent):
     """
+    Uses Query Classifier from Haystack, processes the query based on query type.
+    Its ability to recognise statements is not so good, therefore there is a
+    chance that statements also get modified. Ex: "List water related issues"
+    will be identified by the model as keywords, and therefore it will be
+    processed as "find all issues related to 'list all water related issues'".
+    This is one shortcoming but it is ignored for now, as semantic search will
+    not be affected much by this.
     1. https://docs.haystack.deepset.ai/docs/query_classifier
     """
                 split_length:int = 2, split_overlap = 0,
                 removePunc = False)->List[Document]:
     """
+    creates the pipeline and runs the preprocessing pipeline.
     Params
     ------------
     Return
     -------
+    retriever: embedding model
     """
     logging.info("loading retriever")
     if document_store is None:
 @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
 def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
     """
+    Creates the InMemory Document Store from haystack list of Documents.
     It is  mandatory component for Retriever to work in Haystack frame work.
     Params
     """
     document_store = InMemoryDocumentStore(similarity = similarity)
     document_store.write_documents(documents)
     return document_store
                  reader_model:str =  None, reader_top_k:int = 10):
     """
     creates the semantic search pipeline and document Store object from the
+    list of haystack documents. The top_k for the Reader and Retriever are kept
+    same, so that all the results returned by Retriever are used, however the
+    context is extracted by Reader for each retrieved result. The querycheck is
+    added as node to process the query.
     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
     2. https://www.sbert.net/examples/applications/semantic-search/README.html
     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
     nodes [QueryCheck, Retriever, Reader]
     document_store: As retriever can work only with Haystack Document Store, the
+    list of documents returned by the preprocessing pipeline is used to create
+    an InMemoryDocumentStore object, with the retriever updating the
+    embeddings of each paragraph in the document store.
     """
+    document_store = createDocumentStore(documents)
     retriever = loadRetriever(embedding_model = embedding_model,
                     embedding_model_format=embedding_model_format,
                     embedding_layer=embedding_layer,
                     retriever_top_k= retriever_top_k,
                     document_store = document_store)
     document_store.update_embeddings(retriever)
     querycheck = QueryCheck()
     reader = FARMReader(model_name_or_path=reader_model,
                     top_k = reader_top_k, use_gpu=True)
     semanticsearch_pipeline = Pipeline()
     semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
         end_idx = temp['offsets_in_document'][0]['end']
         match = [[start_idx,end_idx]]
         doc = doc_store.get_document_by_id(temp['document_id']).content
+        if check_streamlit:
+            st.write("Result {}".format(i+1))
+        else:
+            print("Result {}".format(i+1))
+        semanticsearchAnnotator(match, doc)