prashant committed
Commit · 1d3978a
1 Parent(s): 3d34c75
updating overlap in preprocessing
- appStore/keyword_search.py +1 -2
- paramconfig.cfg +5 -0
- utils/preprocessing.py +5 -5
- utils/sdg_classifier.py +4 -1
- utils/search.py +36 -4
appStore/keyword_search.py
CHANGED
@@ -5,8 +5,7 @@ sys.path.append('../utils')
 import streamlit as st
 import json
 import logging
-from utils.search import runLexicalPreprocessingPipeline,
-from utils.search import runSpacyMatcher, lexical_search
+from utils.search import runLexicalPreprocessingPipeline, lexical_search
 
 def app():
 
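For context, a minimal sketch of how the consolidated import might be used inside app(); the page logic below is hypothetical, and only the two imported helpers and their signatures come from this repo:

import streamlit as st
from utils.search import runLexicalPreprocessingPipeline, lexical_search

def app():
    # Hypothetical wiring: preprocess the uploaded file into Haystack
    # Documents, then run the lexical search over them.
    documents = runLexicalPreprocessingPipeline()
    query = st.text_input("Enter search term")
    if query:
        lexical_search(query, documents)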
paramconfig.cfg
CHANGED
@@ -3,18 +3,23 @@ TOP_K = 20
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
+SPLIT_OVERLAP = 0
 
 [semantic_search]
 TOP_K = 10
 MAX_SEQ_LENGTH = 64
 MODEL_NAME = msmarco-distilbert-cos-v5
 THRESHOLD = 0.1
+SPLIT_BY = sentence
+SPLIT_LENGTH = 3
+SPLIT_OVERLAP = 0
 
 [sdg]
 THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
 SPLIT_LENGTH = 110
+SPLIT_OVERLAP = 10
 
 [preprocessor]
 SPLIT_OVERLAP_WORD = 10
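For reference, a minimal sketch of how these settings are consumed, assuming the repo's config object is a standard configparser.ConfigParser loaded from paramconfig.cfg (the diffs below read it with exactly these config.get calls):

import configparser

# The paramconfig.cfg path here is an assumption; the repo may resolve it
# relative to a different directory.
config = configparser.ConfigParser()
config.read('paramconfig.cfg')

# configparser returns strings, so numeric options are cast explicitly,
# mirroring the calls added in this commit.
split_by = config.get('sdg', 'SPLIT_BY')                  # 'word'
split_length = int(config.get('sdg', 'SPLIT_LENGTH'))     # 110
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))   # 10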
utils/preprocessing.py
CHANGED
@@ -167,12 +167,12 @@ class UdfPreProcessor(BaseComponent):
 
     """
     outgoing_edges = 1
-    split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
-    split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
+    # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
+    # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
 
     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
-            split_length:int = 2):
+            split_length:int = 2, split_overlap = 0):
 
         """ this is required method to invoke the component in
         the pipeline implementation.
@@ -198,11 +198,11 @@ class UdfPreProcessor(BaseComponent):
 
         if split_by == 'sentence':
             split_respect_sentence_boundary = False
-            split_overlap=self.split_overlap_sentence
+            # split_overlap=self.split_overlap_sentence
 
         else:
             split_respect_sentence_boundary = True
-            split_overlap= self.split_overlap_word
+            # split_overlap= self.split_overlap_word
 
         preprocessor = PreProcessor(
             clean_empty_lines=True,
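To show where the new run() parameter lands, here is a hedged sketch of the Haystack 1.x PreProcessor construction the method performs; only clean_empty_lines is visible in the hunk above, so the remaining keyword arguments are assumptions based on Haystack defaults:

from haystack.nodes import PreProcessor

# split_by, split_length and split_overlap now arrive as run() arguments
# instead of class attributes read from the [preprocessor] config section.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    split_by='word',                        # or 'sentence'
    split_length=110,                       # paramconfig [sdg] SPLIT_LENGTH
    split_overlap=10,                       # paramconfig [sdg] SPLIT_OVERLAP
    split_respect_sentence_boundary=True,   # True for word splitting here
)
documents = preprocessor.process(docs)      # docs: List[Document]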
utils/sdg_classifier.py
CHANGED
@@ -86,12 +86,15 @@ def runSDGPreprocessingPipeline()->List[Document]:
     sdg_processing_pipeline = processingpipeline()
     split_by = config.get('sdg','SPLIT_BY')
     split_length = int(config.get('sdg','SPLIT_LENGTH'))
+    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+
 
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                             "UdfPreProcessor": {"removePunc": False, \
                                         "split_by": split_by, \
-                                        "split_length":split_length}})
+                                        "split_length":split_length,\
+                                        "split_overlap": split_overlap}})
 
     return output_sdg_pre['documents']
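A hedged usage sketch of the updated function; it takes no arguments because, as in the sibling pipelines in utils/search.py, the file path and name are assumed to come from st.session_state:

# Hypothetical call site. Returns Haystack Documents split into 110-word
# chunks that now overlap by 10 words ([sdg] SPLIT_OVERLAP above).
docs = runSDGPreprocessingPipeline()
for doc in docs[:3]:
    print(doc.content[:80])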
utils/search.py
CHANGED
@@ -117,6 +117,8 @@ def searchAnnotator(matches: List[List[int]], document):
                 label="ANSWER", background="#964448", color='#ffffff')))
             start = end_idx
 
+    annotated_text = annotated_text + document[end_idx:].text
+
     st.write(
         markdown(annotated_text),
         unsafe_allow_html=True,
@@ -137,9 +139,10 @@ def lexical_search(query:Text,documents:List[Document]):
                         top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
-        matches, doc = runSpacyMatcher(query_tokens,result.content)
-        st.write("Result {}".format(count))
-        searchAnnotator(matches, doc)
+        if result.content != "":
+            matches, doc = runSpacyMatcher(query_tokens,result.content)
+            st.write("Result {}".format(count))
+            searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
     """
@@ -159,13 +162,42 @@ def runLexicalPreprocessingPipeline()->List[Document]:
     sdg_processing_pipeline = processingpipeline()
     split_by = config.get('lexical_search','SPLIT_BY')
     split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
 
     output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                             "UdfPreProcessor": {"removePunc": False, \
                                         "split_by": split_by, \
-                                        "split_length":split_length}})
+                                        "split_length":split_length,\
+                                        "split_overlap": split_overlap}})
 
     return output_lexical_pre['documents']
 
+def runSemanticPreprocessingPipeline()->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    Return
+    --------------
+    List[Document]: When the preprocessing pipeline is run, the output dictionary
+    has four objects. For the Haystack implementation of SDG classification, we
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+
+    """
+    file_path = st.session_state['filepath']
+    file_name = st.session_state['filename']
+    sdg_processing_pipeline = processingpipeline()
+    split_by = config.get('lexical_search','SPLIT_BY')
+    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+
+    output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                            "UdfPreProcessor": {"removePunc": False, \
+                                        "split_by": split_by, \
+                                        "split_length":split_length}})
+
+    return output_lexical_pre['documents']
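Finally, a plain-Python illustration of what SPLIT_OVERLAP does, independent of Haystack and assuming the usual sliding-window semantics (each chunk starts split_length - split_overlap words after the previous one); the numbers mirror the [sdg] section:

def split_with_overlap(text, split_length=110, split_overlap=10):
    # Sliding window over words: consecutive chunks share `split_overlap`
    # words, so sentences straddling a chunk boundary keep some context.
    words = text.split()
    step = split_length - split_overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + split_length]))
        if start + split_length >= len(words):
            break
    return chunks

# 300 words -> windows starting at 0, 100, 200: three chunks, each sharing
# 10 words with its neighbour.
chunks = split_with_overlap("word " * 300)
print(len(chunks))  # 3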