Spaces:

TeresaK
/

cpv_test

Runtime error

App Files Files Community

leavoigt committed on Sep 22, 2023

Commit

cfcd3f8

1 Parent(s): f937ba1

Upload 9 files

Browse files

Files changed (9) hide show

utils/checkconfig.py +15 -0
utils/keyword_extraction.py +140 -0
utils/lexical_search.py +251 -0
utils/ndc_explorer.py +90 -0
utils/preprocessing.py +260 -0
utils/sdg_classifier.py +177 -0
utils/semantic_search.py +582 -0
utils/streamlitcheck.py +42 -0
utils/uploadAndExample.py +33 -0

utils/checkconfig.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import configparser
+import logging
+def getconfig(configfile_path:str):
+    """
+    configfile_path: file path of .cfg file
+    """
+    config = configparser.ConfigParser()
+    try:
+        config.read_file(open(configfile_path))
+        return config
+    except:
+        logging.warning("config file not found")

utils/keyword_extraction.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import pandas as pd
+# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+# import nltk
+# nltk.download('stopwords')
+# from nltk.corpus import stopwords
+import pickle
+from typing import List, Text
+import logging
+from summa import keywords
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
+def sort_coo(coo_matrix):
+    """
+    It takes Coordinate format scipy sparse matrix and extracts info from same.\
+    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
+    """
+    tuples = zip(coo_matrix.col, coo_matrix.data)
+    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
+def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
+    """get the feature names and tf-idf score of top n items
+    Params
+    ---------
+    feature_names: list of words from vectorizer
+    sorted_items: tuple returned by sort_coo function defined in  \
+    keyword_extraction.py
+    topn: topn words to be extracted using tfidf
+    Return
+    ----------
+    results: top extracted keywords
+    """
+    #use only topn items from vector
+    sorted_items = sorted_items[:top_n]
+    score_vals = []
+    feature_vals = []
+    # word index and corresponding tf-idf score
+    for idx, score in sorted_items:
+        #keep track of feature name and its corresponding score
+        score_vals.append(round(score, 3))
+        feature_vals.append(feature_names[idx])
+    results= {}
+    for idx in range(len(feature_vals)):
+        results[feature_vals[idx]]=score_vals[idx]
+    return results
+def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
+    """
+    TFIDF based keywords extraction
+    Params
+    ---------
+    vectorizer: trained cont vectorizer model
+    tfidfmodel: TFIDF Tranformer model
+    top_n: Top N keywords to be extracted
+    textdata: text data to which needs keyword extraction
+    Return
+    ----------
+    keywords: top extracted keywords
+    """
+    features = vectorizer.get_feature_names_out()
+    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
+    sorted_items=sort_coo(tf_idf_vector.tocoo())
+    results=extract_topn_from_vector(features,sorted_items,top_n)
+    keywords = [keyword for keyword in results]
+    return keywords
+def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
+    """
+    TFIDF based keywords extraction
+    Params
+    ---------
+    sdg: which sdg tfidf model to be used
+    sdgdata: text data to which needs keyword extraction
+    Return
+    ----------
+    keywords: top extracted keywords
+    """
+    model_path = "docStore/sdg{}/".format(sdg)
+    vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
+    tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
+    features = vectorizer.get_feature_names_out()
+    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
+    sorted_items=sort_coo(tf_idf_vector.tocoo())
+    top_n = top_n
+    results=extract_topn_from_vector(features,sorted_items,top_n)
+    keywords = [keyword for keyword in results]
+    return keywords
+@st.cache(allow_output_mutation=True)
+def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
+    """
+    wrappper function to perform textrank, uses either ratio or wordcount to
+    extract top keywords limited by words or ratio.
+    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
+    Params
+    --------
+    textdata: text data to perform the textrank.
+    ratio: float to limit the number of keywords as proportion of total token \
+        in textdata
+    words: number of keywords to be extracted. Takes priority over ratio if \
+        Non zero. Howevr incase the pagerank returns lesser keywords than \
+        compared to fix value then ratio is used.
+    Return
+    --------
+    results: extracted keywords
+    """
+    if words == 0:
+        logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
+        results = keywords.keywords(textdata, ratio= ratio).split("\n")
+    else:
+        try:
+            results = keywords.keywords(textdata, words= words).split("\n")
+        except:
+            results = keywords.keywords(textdata, ratio = ratio).split("\n")
+    return results

utils/lexical_search.py ADDED Viewed

	@@ -0,0 +1,251 @@

+from haystack.nodes import TfidfRetriever
+from haystack.document_stores import InMemoryDocumentStore
+import spacy
+import re
+from spacy.matcher import Matcher
+from markdown import markdown
+from annotated_text import annotation
+from haystack.schema import Document
+from typing import List, Text, Tuple
+from typing_extensions import Literal
+from utils.preprocessing import processingpipeline
+from utils.streamlitcheck import check_streamlit
+import logging
+try:
+    from termcolor import colored
+except:
+    pass
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
+def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
+                        split_by: Literal["sentence", "word"] = 'word',
+                        split_length:int = 80, split_overlap:int = 0,
+                        remove_punc:bool = False,)->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig. As lexical doesnt gets
+    affected by overlap, threfore split_overlap = 0 in default paramconfig and
+    split_by = word.
+    Params
+    ------------
+    file_name: filename, in case of streamlit application use
+    st.session_state['filename']
+    file_path: filepath, in case of streamlit application use
+    st.session_state['filepath']
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragrpahs from document,
+                    it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating
+        the paragraphs. This is done as one sentence or 'some words' make sense
+        when  read in together with others. Therefore the overlap is used.
+    splititng of text.
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+    has four objects. For the lexicaal search using TFIDFRetriever we
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+    """
+    lexical_processing_pipeline = processingpipeline()
+    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                        "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                            "split_by": split_by, \
+                                            "split_length":split_length,\
+                                            "split_overlap": split_overlap}})
+    return output_lexical_pre
+def tokenize_lexical_query(query:str)-> List[str]:
+    """
+    Removes the stop words from query and returns the list of important keywords
+    in query. For the lexical search the relevent paragraphs in document are
+    retreived using TfIDFretreiver from Haystack. However to highlight these
+    keywords we need the tokenized form of query.
+    Params
+    --------
+    query: string which represents either list of keywords user is looking for
+            or a query in form of Question.
+    Return
+    -----------
+    token_list: list of important keywords in the query.
+    """
+    nlp = spacy.load("en_core_web_sm")
+    token_list = [token.text.lower() for token in nlp(query)
+                  if not (token.is_stop or token.is_punct)]
+    return token_list
+def runSpacyMatcher(token_list:List[str], document:Text
+                    )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
+    """
+    Using the spacy in backend finds the keywords in the document using the
+    Matcher class from spacy. We can alternatively use the regex, but spacy
+    finds all keywords in serialized manner which helps in annotation of answers.
+    Params
+    -------
+    token_list: this is token list which tokenize_lexical_query function returns
+    document: text in which we need to find the tokens
+    Return
+    --------
+    matches: List of [start_index, end_index] in the spacydoc(at word level not
+    character) for the keywords in token list.
+    spacydoc: the keyword index in the spacydoc are at word level and not character,
+    therefore to allow the annotator to work seamlessly we return the spacydoc.
+    """
+    nlp = spacy.load("en_core_web_sm")
+    spacydoc = nlp(document)
+    matcher = Matcher(nlp.vocab)
+    token_pattern = [[{"LOWER":token}] for token in token_list]
+    matcher.add(",".join(token_list), token_pattern)
+    spacymatches = matcher(spacydoc)
+    # getting start and end index in spacydoc so that annotator can work seamlessly
+    matches = []
+    for match_id, start, end in spacymatches:
+        matches = matches + [[start, end]]
+    return matches, spacydoc
+def runRegexMatcher(token_list:List[str], document:Text):
+    """
+    Using the regex in backend finds the keywords in the document.
+    Params
+    -------
+    token_list: this is token list which tokenize_lexical_query function returns
+    document: text in which we need to find the tokens
+    Return
+    --------
+    matches: List of [start_index, end_index] in the document for the keywords
+    in token list at character level.
+    document: the keyword index returned by regex are at character level,
+    therefore to allow the annotator to work seamlessly we return the text back.
+    """
+    matches = []
+    for token in token_list:
+        matches = (matches +
+                  [[val.start(), val.start() +
+                  len(token)] for val in re.finditer(token, document)])
+    return matches, document
+def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
+    """
+    This is spacy Annotator and needs spacy.doc
+    Annotates the text in the document defined by list of [start index, end index]
+    Example: "How are you today", if document type is text, matches = [[0,3]]
+    will give answer = "How", however in case we used the spacy matcher then the
+    matches = [[0,3]] will give answer = "How are you". However if spacy is used
+    to find "How" then the matches = [[0,1]] for the string defined above.
+    Params
+    -----------
+    matches: As mentioned its list of list. Example [[0,1],[10,13]]
+    document: document which needs to be indexed.
+    Return
+    --------
+    will send the output to either app front end using streamlit or
+    write directly to output screen.
+    """
+    start = 0
+    annotated_text = ""
+    for match in matches:
+        start_idx = match[0]
+        end_idx = match[1]
+        if check_streamlit():
+            annotated_text = (annotated_text + document[start:start_idx].text
+                            + str(annotation(body=document[start_idx:end_idx].text,
+                            label="ANSWER", background="#964448", color='#ffffff')))
+        else:
+            annotated_text = (annotated_text + document[start:start_idx].text
+                            + colored(document[start_idx:end_idx].text,
+                          "green", attrs = ['bold']))
+        start = end_idx
+    annotated_text = annotated_text + document[end_idx:].text
+    if check_streamlit():
+        st.write(
+                markdown(annotated_text),
+                unsafe_allow_html=True,
+            )
+    else:
+        print(annotated_text)
+def lexical_search(query:Text, documents:List[Document],top_k:int):
+    """
+    Performs the Lexical search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    Params
+    -------
+    query: Keywords that need to be searche in documents.
+    documents: List of Haystack documents returned by preprocessing pipeline.
+    top_k: Number of Top results to be fetched.
+    """
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+    # Haystack Retriever works with document stores only.
+    retriever = TfidfRetriever(document_store)
+    results = retriever.retrieve(query=query, top_k = top_k)
+    query_tokens = tokenize_lexical_query(query)
+    flag = True
+    for count, result in enumerate(results):
+        matches, doc = runSpacyMatcher(query_tokens,result.content)
+        if len(matches) != 0:
+            if flag:
+                flag = False
+                if check_streamlit():
+                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                else:
+                    print("Top few lexical search (TFIDF) hits")
+            if check_streamlit():
+                st.write("Result {}".format(count+1))
+            else:
+                print("Results {}".format(count +1))
+            spacyAnnotator(matches, doc)
+    if flag:
+        if check_streamlit():
+            st.info("🤔 No relevant result found. Please try another keyword.")
+        else:
+            print("No relevant result found. Please try another keyword.")

utils/ndc_explorer.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import urllib.request
+import json
+link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
+def get_document(country_code: str):
+    """
+    read the country NDC data from
+    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
+    using the country code.
+    Params
+    -------
+    country_code:"""
+    with urllib.request.urlopen(link) as urlfile:
+        data =  json.loads(urlfile.read())
+    categoriesData = {}
+    categoriesData['categories']= data['categories']
+    categoriesData['subcategories']= data['subcategories']
+    keys_sub = categoriesData['subcategories'].keys()
+    documentType= 'NDCs'
+    if documentType in data.keys():
+        if country_code in data[documentType].keys():
+            get_dict = {}
+            for key, value in data[documentType][country_code].items():
+                if key not in ['country_name','region_id', 'region_name']:
+                    get_dict[key] = value['classification']
+                else:
+                    get_dict[key] = value
+        else:
+            return None
+    else:
+        return None
+    country = {}
+    for key in categoriesData['categories']:
+        country[key]= {}
+    for key,value in categoriesData['subcategories'].items():
+        country[value['category']][key] = get_dict[key]
+    return country
+def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
+    """
+    based on the countrycode, reads the country data from
+    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
+    using get_documents from utils.ndc_explorer.py
+    then based on thereshold value filters the Climate Change Adaptation
+    targets assigned by NDC explorer team to that country. Using the sentences
+    create by Data services team of GIZ for each target level, tries to find the
+    relevant passages from the document by doing the semantic search.
+    Params
+    -------
+    cca_sent: dictionary with key as 'target labels' and manufactured sentences
+    reflecting the target level. Please see the docStore/ndcs/cca.txt
+    threshold: NDC target have many categoriees ranging from [0-5], with 0
+    refelcting most relaxed attitude and 5 being most aggrisive towards Climate
+    change. We select the threshold value beyond which we need to focus on.
+    countryCode: standard country code to allow us to fetch the country specific
+    data.
+    """
+    temp = {}
+    doc = get_document(countryCode)
+    for key,value in cca_sent.items():
+        id_ = doc['climate change adaptation'][key]['id']
+        if id_ >threshold:
+            temp[key] = value['id'][id_]
+    return temp
+def countrySpecificCCM(ccm_sent, threshold, countryCode):
+    """
+    see the documentation of countrySpecificCCA. This is same instead of
+    this gets the data pertaining to Adaptation
+    """
+    temp = {}
+    doc = get_document(countryCode)
+    for key,value in ccm_sent.items():
+        id_ = doc['climate change mitigation'][key]['id']
+        if id_ >threshold:
+            temp[key] = value['id'][id_]
+    return temp

utils/preprocessing.py ADDED Viewed

	@@ -0,0 +1,260 @@

+from haystack.nodes.base import BaseComponent
+from haystack.schema import Document
+from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
+from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+from typing import Callable, Dict, List, Optional, Text, Tuple, Union
+from typing_extensions import Literal
+import pandas as pd
+import logging
+import re
+import string
+from haystack.pipelines import Pipeline
+def useOCR(file_path: str)-> Text:
+    """
+    Converts image pdfs into text, Using the Farm-haystack[OCR]
+    Params
+    ----------
+    file_path: file_path of uploade file, returned by add_upload function in
+    uploadAndExample.py
+    Returns the text file as string.
+    """
+    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
+                                      valid_languages=["eng"])
+    docs = converter.convert(file_path=file_path, meta=None)
+    return docs[0].content
+class FileConverter(BaseComponent):
+    """
+    Wrapper class to convert uploaded document into text by calling appropriate
+    Converter class, will use internally haystack PDFToTextOCR in case of image
+    pdf. Cannot use the FileClassifier from haystack as its doesnt has any
+    label/output class for image.
+    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
+    2. https://docs.haystack.deepset.ai/docs/file_converters
+    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
+    4. https://docs.haystack.deepset.ai/reference/file-converters-api
+    """
+    outgoing_edges = 1
+    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
+            id_hash_keys: Optional[List[str]] = None,
+            ) -> Tuple[dict,str]:
+        """ this is required method to invoke the component in
+            the pipeline implementation.
+        Params
+        ----------
+        file_name: name of file
+        file_path: file_path of uploade file, returned by add_upload function in
+                    uploadAndExample.py
+        See the links provided in Class docstring/description to see other params
+        Return
+        ---------
+        output: dictionary, with key as identifier and value could be anything
+                we need to return. In this case its the List of Hasyatck Document
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        try:
+            if file_name.endswith('.pdf'):
+                converter = PDFToTextConverter(remove_numeric_tables=True)
+            if file_name.endswith('.txt'):
+                converter = TextConverter(remove_numeric_tables=True)
+            if file_name.endswith('.docx'):
+                converter = DocxToTextConverter()
+        except Exception as e:
+            logging.error(e)
+            return
+        documents = []
+        document = converter.convert(
+                      file_path=file_path, meta=None,
+                      encoding=encoding, id_hash_keys=id_hash_keys
+                      )[0]
+        text = document.content
+        # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
+        # subsitute this substring with '',and check if content is empty string
+        text = re.sub(r'\x0c', '', text)
+        documents.append(Document(content=text,
+                              meta={"name": file_name},
+                              id_hash_keys=id_hash_keys))
+        # check if text is empty and apply pdfOCR converter.
+        for i in documents:
+            if i.content == "":
+                logging.info("Using OCR")
+                i.content = useOCR(file_path)
+        logging.info('file conversion succesful')
+        output = {'documents': documents}
+        return output, 'output_1'
+    def run_batch():
+        """
+        we dont have requirement to process the multiple files in one go
+        therefore nothing here, however to use the custom node we need to have
+        this method for the class.
+        """
+        return
+def basic(s:str, remove_punc:bool = False):
+    """
+    Performs basic cleaning of text.
+    Params
+    ----------
+    s: string to be processed
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    Returns: processed string: see comments in the source code for more info
+    """
+    # Remove URLs
+    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+    s = re.sub(r"http\S+", " ", s)
+    # Remove new line characters
+    s = re.sub('\n', ' ', s)
+    # Remove punctuations
+    if remove_punc == True:
+      translator = str.maketrans(' ', ' ', string.punctuation)
+      s = s.translate(translator)
+    # Remove distracting single quotes and dotted pattern
+    s = re.sub("\'", " ", s)
+    s = s.replace("..","")
+    return s.strip()
+class UdfPreProcessor(BaseComponent):
+    """
+    class to preprocess the document returned by FileConverter. It will check
+    for splitting strategy and splits the document by word or sentences and then
+    synthetically create the paragraphs.
+    1. https://docs.haystack.deepset.ai/docs/preprocessor
+    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
+    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
+    """
+    outgoing_edges = 1
+    def run(self, documents:List[Document], remove_punc:bool=False,
+            split_by: Literal["sentence", "word"] = 'sentence',
+            split_length:int = 2, split_respect_sentence_boundary:bool = False,
+            split_overlap:int = 0):
+        """ this is required method to invoke the component in
+        the pipeline implementation.
+        Params
+        ----------
+        documents: documents from the output dictionary returned by Fileconverter
+        remove_punc: to remove all Punctuation including ',' and '.' or not
+        split_by: document splitting strategy either as word or sentence
+        split_length: when synthetically creating the paragrpahs from document,
+                      it defines the length of paragraph.
+        split_respect_sentence_boundary: Used when using 'word' strategy for
+        splititng of text.
+        split_overlap: Number of words or sentences that overlap when creating
+        the paragraphs. This is done as one sentence or 'some words' make sense
+        when  read in together with others. Therefore the overlap is used.
+        Return
+        ---------
+        output: dictionary, with key as identifier and value could be anything
+                we need to return. In this case the output will contain 4 objects
+                the paragraphs text list as List, Haystack document, Dataframe and
+                one raw text file.
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        if split_by == 'sentence':
+            split_respect_sentence_boundary = False
+        else:
+            split_respect_sentence_boundary = split_respect_sentence_boundary
+        preprocessor = PreProcessor(
+            clean_empty_lines=True,
+            clean_whitespace=True,
+            clean_header_footer=True,
+            split_by=split_by,
+            split_length=split_length,
+            split_respect_sentence_boundary= split_respect_sentence_boundary,
+            split_overlap=split_overlap,
+            # will add page number only in case of PDF not for text/docx file.
+            add_page_number=True
+            )
+        for i in documents:
+            # # basic cleaning before passing it to preprocessor.
+            # i = basic(i)
+            docs_processed = preprocessor.process([i])
+            for item in docs_processed:
+                item.content = basic(item.content, remove_punc= remove_punc)
+        df = pd.DataFrame(docs_processed)
+        all_text = " ".join(df.content.to_list())
+        para_list = df.content.to_list()
+        logging.info('document split into {} paragraphs'.format(len(para_list)))
+        output = {'documents': docs_processed,
+                  'dataframe': df,
+                  'text': all_text,
+                  'paraList': para_list
+                 }
+        return output, "output_1"
+    def run_batch():
+        """
+            we dont have requirement to process the multiple files in one go
+            therefore nothing here, however to use the custom node we need to have
+            this method for the class.
+        """
+        return
+def processingpipeline():
+    """
+    Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
+    from utils.preprocessing
+    """
+    preprocessing_pipeline = Pipeline()
+    file_converter = FileConverter()
+    custom_preprocessor = UdfPreProcessor()
+    preprocessing_pipeline.add_node(component=file_converter,
+                                    name="FileConverter", inputs=["File"])
+    preprocessing_pipeline.add_node(component = custom_preprocessor,
+                            name ='UdfPreProcessor', inputs=["FileConverter"])
+    return preprocessing_pipeline

utils/sdg_classifier.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from haystack.nodes import TransformersDocumentClassifier
+from haystack.schema import Document
+from typing import List, Tuple
+from typing_extensions import Literal
+import logging
+import pandas as pd
+from pandas import DataFrame, Series
+from utils.checkconfig import getconfig
+from utils.streamlitcheck import check_streamlit
+from utils.preprocessing import processingpipeline
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
+## Labels dictionary ###
+_lab_dict = {0: 'no_cat',
+            1:'SDG 1 - No poverty',
+            2:'SDG 2 - Zero hunger',
+            3:'SDG 3 - Good health and well-being',
+            4:'SDG 4 - Quality education',
+            5:'SDG 5 - Gender equality',
+            6:'SDG 6 - Clean water and sanitation',
+            7:'SDG 7 - Affordable and clean energy',
+            8:'SDG 8 - Decent work and economic growth',
+            9:'SDG 9 - Industry, Innovation and Infrastructure',
+            10:'SDG 10 - Reduced inequality',
+            11:'SDG 11 - Sustainable cities and communities',
+            12:'SDG 12 - Responsible consumption and production',
+            13:'SDG 13 - Climate action',
+            14:'SDG 14 - Life below water',
+            15:'SDG 15 - Life on land',
+            16:'SDG 16 - Peace, justice and strong institutions',
+            17:'SDG 17 - Partnership for the goals',}
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
+    """
+    loads the document classifier using haystack, where the name/path of model
+    in HF-hub as string is used to fetch the model object.Either configfile or
+    model should be passed.
+    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+    2. https://docs.haystack.deepset.ai/docs/document_classifier
+    Params
+    --------
+    config_file: config file path from which to read the model name
+    classifier_name: if modelname is passed, it takes a priority if not \
+    found then will look for configfile, else raise error.
+    Return: document classifier model
+    """
+    if not classifier_name:
+        if not config_file:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(config_file)
+            classifier_name = config.get('sdg','MODEL')
+    logging.info("Loading classifier")
+    doc_classifier = TransformersDocumentClassifier(
+                        model_name_or_path=classifier_name,
+                        task="text-classification")
+    return doc_classifier
+@st.cache(allow_output_mutation=True)
+def sdg_classification(haystack_doc:List[Document],
+                        threshold:float = 0.8,
+                        classifier_model:TransformersDocumentClassifier= None
+                        )->Tuple[DataFrame,Series]:
+    """
+    Text-Classification on the list of texts provided. Classifier provides the
+    most appropriate label for each text. these labels are in terms of if text
+    belongs to which particular Sustainable Devleopment Goal (SDG).
+    Params
+    ---------
+    haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
+    contains the list of paragraphs in different format,here the list of
+    Haystack Documents is used.
+    threshold: threshold value for the model to keep the results from classifier
+    classifiermodel: you can pass the classifier model directly,which takes priority
+    however if not then looks for model in streamlit session.
+    In case of streamlit avoid passing the model directly.
+    Returns
+    ----------
+    df: Dataframe with two columns['SDG:int', 'text']
+    x: Series object with the unique SDG covered in the document uploaded and
+    the number of times it is covered/discussed/count_of_paragraphs.
+    """
+    logging.info("Working on SDG Classification")
+    if not classifier_model:
+        if check_streamlit():
+            classifier_model = st.session_state['sdg_classifier']
+        else:
+            logging.warning("No streamlit envinornment found, Pass the classifier")
+            return
+    results = classifier_model.predict(haystack_doc)
+    labels_= [(l.meta['classification']['label'],
+            l.meta['classification']['score'],l.content,) for l in results]
+    df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
+    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    df.index += 1
+    df =df[df['Relevancy']>threshold]
+    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
+    x = df['SDG'].value_counts()
+    x = x.rename('count')
+    x = x.rename_axis('SDG').reset_index()
+    x["SDG"] = pd.to_numeric(x["SDG"])
+    x = x.sort_values(by=['count'], ascending=False)
+    x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
+    x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
+    df['SDG'] = pd.to_numeric(df['SDG'])
+    df = df.sort_values('SDG')
+    return df, x
+def runSDGPreprocessingPipeline(file_name:str, file_path:str,
+            split_by: Literal["sentence", "word"] = 'sentence',
+            split_length:int = 2, split_respect_sentence_boundary:bool = False,
+            split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+    Params
+    ------------
+    file_name: filename, in case of streamlit application use
+    st.session_state['filename']
+    file_path: filepath, in case of streamlit application use st.session_state['filepath']
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragrpahs from document,
+                    it defines the length of paragraph.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+    splititng of text.
+    split_overlap: Number of words or sentences that overlap when creating
+        the paragraphs. This is done as one sentence or 'some words' make sense
+        when  read in together with others. Therefore the overlap is used.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+    has four objects. For the Haysatck implementation of SDG classification we,
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+    """
+    sdg_processing_pipeline = processingpipeline()
+    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                            "split_by": split_by, \
+                                            "split_length":split_length,\
+                                            "split_overlap": split_overlap, \
+        "split_respect_sentence_boundary":split_respect_sentence_boundary}})
+    return output_sdg_pre

utils/semantic_search.py ADDED Viewed

	@@ -0,0 +1,582 @@

+from haystack.nodes import TransformersQueryClassifier, Docs2Answers
+from haystack.nodes import EmbeddingRetriever, FARMReader
+from haystack.nodes.base import BaseComponent
+from haystack.document_stores import InMemoryDocumentStore
+from markdown import markdown
+from annotated_text import annotation
+from haystack.schema import Document
+from typing import List, Text, Union
+from typing_extensions import Literal
+from utils.preprocessing import processingpipeline
+from utils.streamlitcheck import check_streamlit
+from haystack.pipelines import Pipeline
+import pandas as pd
+import logging
+try:
+    from termcolor import colored
+except:
+    pass
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
+@st.cache(allow_output_mutation=True)
+def loadQueryClassifier():
+    """
+    retuns the haystack query classifier model
+    model = shahrukhx01/bert-mini-finetune-question-detection
+    """
+    query_classifier = TransformersQueryClassifier(model_name_or_path=
+                            "shahrukhx01/bert-mini-finetune-question-detection")
+    return query_classifier
+class QueryCheck(BaseComponent):
+    """
+    Uses Query Classifier from Haystack, process the query based on query type.
+    Ability to determine the statements is not so good, therefore the chances
+    statement also get modified. Ex: "List water related issues" will be
+    identified by the model as keywords, and therefore it be processed as "what
+    are the 'list all water related issues' related issues and discussions?".
+    This is one shortcoming but is igonred for now, as semantic search will not
+    get affected a lot, by this. If you want to pass keywords list and want to
+    do batch processing use. run_batch. Example: if you want to find relevant
+    passages for water, food security, poverty then querylist = ["water", "food
+    security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+    1. https://docs.haystack.deepset.ai/docs/query_classifier
+    """
+    outgoing_edges = 1
+    def run(self, query:str):
+        """
+        mandatory method to use the custom node. Determines the query type, if
+        if the query is of type keyword/statement will modify it to make it more
+        useful for sentence transoformers.
+        Params
+        --------
+        query: query/statement/keywords in form of string
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+                we need to return. In this case the output contain key = 'query'.
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        query_classifier = loadQueryClassifier()
+        result = query_classifier.run(query=query)
+        if result[1] == "output_1":
+            output = {"query":query,
+                       "query_type": 'question/statement'}
+        else:
+            output = {"query": "what are the {} related issues and \
+                        discussions?".format(query),
+                      "query_type": 'statements/keyword'}
+        logging.info(output)
+        return output, "output_1"
+    def run_batch(self, queries:List[str]):
+        """
+        running multiple queries in one go, howeevr need the queries to be passed
+        as list of string. Example: if you want to find relevant passages for
+        water, food security, poverty then querylist = ["water", "food security",
+        "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+        Params
+        --------
+        queries: queries/statements/keywords in form of string encapsulated
+                within List
+        Return
+        ------
+        output: dictionary, with key as identifier and value could be anything
+                we need to return. In this case the output contain key = 'queries'.
+        output_1: As there is only one outgoing edge, we pass 'output_1' string
+        """
+        query_classifier = loadQueryClassifier()
+        query_list = []
+        for query in queries:
+            result = query_classifier.run(query=query)
+            if result[1] == "output_1":
+                query_list.append(query)
+            else:
+                query_list.append("what are the {} related issues and \
+                    discussions?".format(query))
+        output = {'queries':query_list}
+        logging.info(output)
+        return output, "output_1"
+@st.cache(allow_output_mutation=True)
+def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
+                split_by: Literal["sentence", "word"] = 'sentence',
+                split_length:int = 2, split_overlap:int = 0,
+                split_respect_sentence_boundary:bool = False,
+                remove_punc:bool = False)->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline.
+    Params
+    ------------
+    file_name: filename, in case of streamlit application use
+            st.session_state['filename']
+    file_path: filepath, in case of streamlit application use
+            st.session_state['filepath']
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragrpahs from document,
+            it defines the length of paragraph.
+    split_overlap: Number of words or sentences that overlap when creating the
+            paragraphs. This is done as one sentence or 'some words' make sense
+            when  read in together with others. Therefore the overlap is used.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+            splititng of text.
+    remove_punc: to remove all Punctuation including ',' and '.' or not
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+        has four objects. For the Haysatck implementation of semantic search we,
+        need to use the List of Haystack Document, which can be fetched by
+        key = 'documents' on output.
+    """
+    semantic_processing_pipeline = processingpipeline()
+    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                "UdfPreProcessor": {"remove_punc": remove_punc, \
+                                            "split_by": split_by, \
+                                            "split_length":split_length,\
+                                            "split_overlap": split_overlap,
+        "split_respect_sentence_boundary":split_respect_sentence_boundary}})
+    return output_semantic_pre
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+                                        allow_output_mutation=True)
+def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
+                 embedding_layer:int = None,  retriever_top_k:int = 10,
+                 max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
+    """
+    Returns the Retriever model based on params provided.
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+    Params
+    ---------
+    embedding_model: Name of the model to be used for embedding. Check the links
+            provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+            documentation embedding_layer: check the github link of Haystack
+            provided in documentation retriever_top_k: Number of Top results to
+            be returned by
+    retriever max_seq_len: everymodel has max seq len it can handle, check in
+            model card. Needed to hanlde the edge cases.
+    document_store: InMemoryDocumentStore, write haystack Document list to
+            DocumentStore and pass the same to function call. Can be done using
+            createDocumentStore from utils.
+    Return
+    -------
+    retriever: embedding model
+    """
+    logging.info("loading retriever")
+    if document_store is None:
+        logging.warning("Retriever initialization requires the DocumentStore")
+        return
+    retriever = EmbeddingRetriever(
+                embedding_model=embedding_model,top_k = retriever_top_k,
+                document_store = document_store,
+                emb_extraction_layer=embedding_layer, scale_score =True,
+                model_format=embedding_model_format, use_gpu = True,
+                max_seq_len = max_seq_len )
+    if check_streamlit:
+        st.session_state['retriever'] = retriever
+    return retriever
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+                    allow_output_mutation=True)
+def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
+                        embedding_dim:int = 768):
+    """
+    Creates the InMemory Document Store from haystack list of Documents.
+    It is  mandatory component for Retriever to work in Haystack frame work.
+    Params
+    -------
+    documents: List of haystack document. If using the preprocessing pipeline,
+            can be fetched key = 'documents; on output of preprocessing pipeline.
+    similarity: scoring function, can be either 'cosine' or 'dot_product'
+    embedding_dim: Document store has default value of embedding size = 768, and
+            update_embeddings method of Docstore cannot infer the embedding size of
+            retiever automatically, therefore set this value as per the model card.
+    Return
+    -------
+    document_store: InMemory Document Store object type.
+    """
+    document_store = InMemoryDocumentStore(similarity = similarity,
+                                        embedding_dim = embedding_dim )
+    document_store.write_documents(documents)
+    return document_store
+@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+                                        allow_output_mutation=True)
+def semanticSearchPipeline(documents:List[Document], embedding_model:Text =  None,
+                embedding_model_format:Text = None,embedding_layer:int = None,
+                embedding_dim:int = 768,retriever_top_k:int = 10,
+                reader_model:str =  None, reader_top_k:int = 10,
+                max_seq_len:int =512,useQueryCheck = True,
+                top_k_per_candidate:int = 1):
+    """
+    creates the semantic search pipeline and document Store object from the
+    list of haystack documents. The top_k for the Reader and Retirever are kept
+    same, so that all the results returned by Retriever are used, however the
+    context is extracted by Reader for each retrieved result. The querycheck is
+    added as node to process the query. This pipeline is suited for keyword search,
+    and to some extent extractive QA purpose. The purpose of Reader is strictly to
+    highlight the context for retrieved result and not for QA, however as stated
+    it can work for QA too in limited sense.
+    There are 4 variants of pipeline it can return
+    1.QueryCheck > Retriever > Reader
+    2.Retriever > Reader
+    3.QueryCheck > Retriever > Docs2Answers : If reader is None,
+    then Doc2answer is used to keep the output of pipeline structurally same.
+    4.Retriever > Docs2Answers
+    Links
+    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+    2. https://www.sbert.net/examples/applications/semantic-search/README.html
+    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+    4. https://docs.haystack.deepset.ai/docs/reader
+    Params
+    ----------
+    documents: list of Haystack Documents, returned by preprocessig pipeline.
+    embedding_model: Name of the model to be used for embedding. Check the links
+            provided in documentation
+    embedding_model_format: check the github link of Haystack provided in
+            documentation
+    embedding_layer: check the github link of Haystack provided in documentation
+    embedding_dim: Document store has default value of embedding size = 768, and
+            update_embeddings method of Docstore cannot infer the embedding size of
+            retiever automatically, therefore set this value as per the model card.
+    retriever_top_k: Number of Top results to be returned by retriever
+    reader_model: Name of the model to be used for Reader node in hasyatck
+            Pipeline. Check the links provided in documentation
+    reader_top_k: Reader will use retrieved results to further find better matches.
+            As purpose here is to use reader to extract context, the value is
+            same as retriever_top_k.
+    max_seq_len:everymodel has max seq len it can handle, check in model card.
+            Needed to hanlde the edge cases
+    useQueryCheck: Whether to use the querycheck which modifies the query or not.
+    top_k_per_candidate:How many answers to extract for each candidate doc
+            that is coming from the retriever
+    Return
+    ---------
+    semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
+            nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
+            then Doc2answer is used to keep the output of pipeline structurally
+            same.
+    document_store: As retriever can work only with Haystack Document Store, the
+            list of document returned by preprocessing pipeline are fed into to
+            get InMemmoryDocumentStore object type, with retriever updating the
+            embeddings of each paragraph in document store.
+    """
+    document_store = createDocumentStore(documents=documents,
+                                    embedding_dim=embedding_dim)
+    retriever = loadRetriever(embedding_model = embedding_model,
+                    embedding_model_format=embedding_model_format,
+                    embedding_layer=embedding_layer,
+                    retriever_top_k= retriever_top_k,
+                    document_store = document_store,
+                    max_seq_len=max_seq_len)
+    document_store.update_embeddings(retriever)
+    semantic_search_pipeline = Pipeline()
+    if useQueryCheck and reader_model:
+        querycheck = QueryCheck()
+        reader = FARMReader(model_name_or_path=reader_model,
+                    top_k = reader_top_k, use_gpu=True,
+                    top_k_per_candidate = top_k_per_candidate)
+        semantic_search_pipeline.add_node(component = querycheck,
+                    name = "QueryCheck",inputs = ["Query"])
+        semantic_search_pipeline.add_node(component = retriever,
+                    name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
+        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
+                                        inputs= ["EmbeddingRetriever"])
+    elif reader_model :
+        reader = FARMReader(model_name_or_path=reader_model,
+                    top_k = reader_top_k, use_gpu=True,
+                    top_k_per_candidate = top_k_per_candidate)
+        semantic_search_pipeline.add_node(component = retriever,
+                    name = "EmbeddingRetriever",inputs = ["Query"])
+        semantic_search_pipeline.add_node(component = reader,
+                    name = "FARMReader",inputs= ["EmbeddingRetriever"])
+    elif useQueryCheck and not reader_model:
+        querycheck = QueryCheck()
+        docs2answers = Docs2Answers()
+        semantic_search_pipeline.add_node(component = querycheck,
+                        name = "QueryCheck",inputs = ["Query"])
+        semantic_search_pipeline.add_node(component = retriever,
+                        name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
+        semantic_search_pipeline.add_node(component = docs2answers,
+                        name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+    elif not useQueryCheck and not reader_model:
+        docs2answers = Docs2Answers()
+        semantic_search_pipeline.add_node(component = retriever,
+                        name = "EmbeddingRetriever",inputs = ["Query"])
+        semantic_search_pipeline.add_node(component = docs2answers,
+                        name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
+    logging.info(semantic_search_pipeline.components)
+    return semantic_search_pipeline, document_store
+def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
+    """
+    will use the haystack run or run_batch based on if single query is passed
+    as string or multiple queries as List[str]
+    Params
+    -------
+    pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
+            from utils.semanticsearch
+    queries: Either a single query or list of queries.
+    Return
+    -------
+    results: Dict containing answers and documents as key and their respective
+            values
+    """
+    if type(queries) == list:
+        results = pipeline.run_batch(queries=queries)
+    elif type(queries) == str:
+        results = pipeline.run(query=queries)
+    else:
+        logging.info("Please check the input type for the queries")
+        return
+    return results
+def process_query_output(results:dict)->pd.DataFrame:
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. This is designed for output given
+    by semantic search pipeline with single query and final node as reader.
+    The output of pipeline having Docs2Answers as final node or multiple queries
+    need to be handled separately. In these other cases, use process_semantic_output
+    from utils.semantic_search which uses this function internally to make one
+    combined dataframe.
+    Params
+    ---------
+    results: this dictionary should have key,values with
+            keys = [query,answers,documents], however answers is optional.
+            in case of [Doc2Answers as final node], process_semantic_output
+            doesnt return answers thereby setting all values contained in
+            answers to 'None'
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+    """
+    query_text = results['query']
+    if 'answers' in results.keys():
+        answer_dict = {}
+        for answer in results['answers']:
+            answer_dict[answer.document_id] = answer.to_dict()
+    else:
+        answer_dict = {}
+    docs = results['documents']
+    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
+                            'context','content','reader_score','retriever_score',
+                            'id'])
+    for doc in docs:
+        row_list = {}
+        row_list['query'] = query_text
+        row_list['retriever_score'] = doc.score
+        row_list['id'] = doc.id
+        row_list['content'] = doc.content
+        if doc.id in answer_dict.keys():
+            row_list['answer'] = answer_dict[doc.id]['answer']
+            row_list['context'] = answer_dict[doc.id]['context']
+            row_list['reader_score'] = answer_dict[doc.id]['score']
+            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
+            row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
+            start_idx = doc.content.find(row_list['context'])
+            end_idx = start_idx + len(row_list['context'])
+            row_list['context_offset'] = [start_idx, end_idx]
+        else:
+            row_list['answer'] = None
+            row_list['context'] = None
+            row_list['reader_score'] = None
+            row_list['answer_offset'] = None
+            row_list['context_offset'] = None
+        df_dictionary = pd.DataFrame([row_list])
+        df = pd.concat([df, df_dictionary], ignore_index=True)
+    return df
+def process_semantic_output(results):
+    """
+    Returns the dataframe with necessary information like including
+    ['query','answer','answer_offset','context_offset','context','content',
+    'reader_score','retriever_score','id',]. Distingushes if its single query or
+    multi queries by reading the pipeline output dictionary keys.
+    Uses the process_query_output to get the dataframe for each query and create
+    one concataneted dataframe. In case of Docs2Answers as final node, deletes
+    the answers part. See documentations of process_query_output.
+    Params
+    ---------
+    results: raw output of runSemanticPipeline.
+    Return
+    --------
+    df: dataframe with all the columns mentioned in function description.
+    """
+    output = {}
+    if 'query' in results.keys():
+        output['query'] = results['query']
+        output['documents'] = results['documents']
+        if results['node_id'] == 'Docs2Answers':
+            pass
+        else:
+            output['answers'] = results['answers']
+        df = process_query_output(output)
+        return df
+    if 'queries' in results.keys():
+        df = pd.DataFrame(columns=['query','answer','answer_offset',
+                                   'context_offset','context','content',
+                                   'reader_score','retriever_score','id'])
+        for query,answers,documents in zip(results['queries'],
+                    results['answers'],results['documents']):
+            output = {}
+            output['query'] = query
+            output['documents'] = documents
+            if results['node_id'] == 'Docs2Answers':
+                    pass
+            else:
+                output['answers'] = answers
+            temp = process_query_output(output)
+            df = pd.concat([df, temp], ignore_index=True)
+    return df
+def semanticsearchAnnotator(matches:List[List[int]], document:Text):
+    """
+    Annotates the text in the document defined by list of [start index, end index]
+    Example: "How are you today", if document type is text, matches = [[0,3]]
+    will give answer = "How", however in case we used the spacy matcher then the
+    matches = [[0,3]] will give answer = "How are you". However if spacy is used
+    to find "How" then the matches = [[0,1]] for the string defined above.
+    """
+    start = 0
+    annotated_text = ""
+    for match in matches:
+        start_idx = match[0]
+        end_idx = match[1]
+        if check_streamlit():
+            annotated_text = (annotated_text + document[start:start_idx]
+                            + str(annotation(body=document[start_idx:end_idx],
+                            label="Context", background="#964448", color='#ffffff')))
+        else:
+            annotated_text = (annotated_text + document[start:start_idx]
+                            + colored(document[start_idx:end_idx],
+                          "green", attrs = ['bold']))
+        start = end_idx
+    annotated_text = annotated_text + document[end_idx:]
+    if check_streamlit():
+        st.write(
+                markdown(annotated_text),
+                unsafe_allow_html=True,
+            )
+    else:
+        print(annotated_text)
+def semantic_keywordsearch(query:Text,documents:List[Document],
+                embedding_model:Text,
+                embedding_model_format:Text,
+                embedding_layer:int,  reader_model:str,
+                retriever_top_k:int = 10, reader_top_k:int = 10,
+                return_results:bool = False, embedding_dim:int = 768,
+                max_seq_len:int = 512,top_k_per_candidate:int =1,
+                sort_by:Literal["retriever", "reader"] = 'retriever'):
+    """
+    Performs the Semantic search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    Params
+    -------
+    query: Keywords that need to be searche in documents.
+    documents: List fo Haystack documents returned by preprocessing pipeline.
+    """
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
+                        embedding_model= embedding_model,
+                        embedding_layer= embedding_layer,
+                        embedding_model_format= embedding_model_format,
+                        reader_model= reader_model, retriever_top_k= retriever_top_k,
+                        reader_top_k= reader_top_k, embedding_dim=embedding_dim,
+                        max_seq_len=max_seq_len,
+                        top_k_per_candidate=top_k_per_candidate)
+    raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
+    results_df = process_semantic_output(raw_output)
+    if sort_by == 'retriever':
+        results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
+    else:
+        results_df = results_df.sort_values(by=['reader_score'], ascending=False)
+    if return_results:
+        return results_df
+    else:
+        if check_streamlit:
+            st.markdown("##### Top few semantic search results #####")
+        else:
+            print("Top few semantic search results")
+        for i in range(len(results_df)):
+            if check_streamlit:
+                st.write("Result {}".format(i+1))
+            else:
+                print("Result {}".format(i+1))
+            semanticsearchAnnotator([results_df.loc[i]['context_offset']],
+                        results_df.loc[i]['content'] )

utils/streamlitcheck.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import logging
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
+def check_streamlit():
+    """
+    Function to check whether python code is run within streamlit
+    Returns
+    -------
+    use_streamlit : boolean
+        True if code is run within streamlit, else False
+    """
+    try:
+        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
+        if not get_script_run_ctx():
+            use_streamlit = False
+        else:
+            use_streamlit = True
+    except ModuleNotFoundError:
+        use_streamlit = False
+    return use_streamlit
+def disable_other_checkboxes(*other_checkboxes_keys):
+    for checkbox_key in other_checkboxes_keys:
+        st.session_state[checkbox_key] = False
+def checkbox_without_preselect(keylist):
+    dict_ = {}
+    for i,key_val in enumerate(keylist):
+        dict_[i] = st.checkbox(key_val,key = key_val,
+        on_change = disable_other_checkboxes,
+        args=tuple(list(filter(lambda x: x!= key_val, keylist))),)
+    for key,val in dict_.items():
+        if val == True:
+            return keylist[int(key)]
+    return None

utils/uploadAndExample.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import streamlit as st
+import tempfile
+import json
+def add_upload(choice):
+    """
+    Provdies the user with choice to either 'Upload Document' or 'Try Example'.
+    Based on user choice runs streamlit processes and save the path and name of
+    the 'file' to streamlit session_state which then can be fetched later.
+    """
+    if choice == 'Upload Document':
+        uploaded_file = st.sidebar.file_uploader('Upload the File',
+                            type=['pdf', 'docx', 'txt'])
+        if uploaded_file is not None:
+            with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
+                bytes_data = uploaded_file.getvalue()
+                temp.write(bytes_data)
+                st.session_state['filename'] = uploaded_file.name
+                st.session_state['filepath'] = temp.name
+    else:
+        # listing the options
+        with open('docStore/sample/files.json','r') as json_file:
+            files = json.load(json_file)
+        option = st.sidebar.selectbox('Select the example document',
+                              list(files.keys()))
+        file_name = file_path  = files[option]
+        st.session_state['filename'] = file_name
+        st.session_state['filepath'] = file_path