prashant committed
Commit · 1d3978a
1 Parent(s): 3d34c75
updating overlap in preprocessing
- appStore/keyword_search.py +1 -2
- paramconfig.cfg +5 -0
- utils/preprocessing.py +5 -5
- utils/sdg_classifier.py +4 -1
- utils/search.py +36 -4
appStore/keyword_search.py
CHANGED
@@ -5,8 +5,7 @@ sys.path.append('../utils')
 import streamlit as st
 import json
 import logging
-from utils.search import runLexicalPreprocessingPipeline,
-from utils.search import runSpacyMatcher, lexical_search
+from utils.search import runLexicalPreprocessingPipeline, lexical_search
 
 def app():
 
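For context, a minimal sketch of how the consolidated import might be used inside app(); the page logic below is hypothetical, and only the two imported helpers and their signatures come from this repo:

import streamlit as st
from utils.search import runLexicalPreprocessingPipeline, lexical_search

def app():
    # Hypothetical wiring: preprocess the uploaded file into Haystack
    # Documents, then run the lexical search over them.
    documents = runLexicalPreprocessingPipeline()
    query = st.text_input("Enter search term")
    if query:
        lexical_search(query, documents)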
paramconfig.cfg
CHANGED
@@ -3,18 +3,23 @@ TOP_K = 20
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
+SPLIT_OVERLAP = 0
 
 [semantic_search]
 TOP_K = 10
 MAX_SEQ_LENGTH = 64
 MODEL_NAME = msmarco-distilbert-cos-v5
 THRESHOLD = 0.1
+SPLIT_BY = sentence
+SPLIT_LENGTH = 3
+SPLIT_OVERLAP = 0
 
 [sdg]
 THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
 SPLIT_LENGTH = 110
+SPLIT_OVERLAP = 10
 
 [preprocessor]
 SPLIT_OVERLAP_WORD = 10
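For reference, a minimal sketch of how these settings are consumed, assuming the repo's config object is a standard configparser.ConfigParser loaded from paramconfig.cfg (the diffs below read it with exactly these config.get calls):

import configparser

# The paramconfig.cfg path here is an assumption; the repo may resolve it
# relative to a different directory.
config = configparser.ConfigParser()
config.read('paramconfig.cfg')

# configparser returns strings, so numeric options are cast explicitly,
# mirroring the calls added in this commit.
split_by = config.get('sdg', 'SPLIT_BY')                  # 'word'
split_length = int(config.get('sdg', 'SPLIT_LENGTH'))     # 110
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))   # 10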
utils/preprocessing.py
CHANGED
@@ -167,12 +167,12 @@ class UdfPreProcessor(BaseComponent):
 
     """
     outgoing_edges = 1
-    split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
-    split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
+    # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
+    # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
 
     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
-            split_length:int = 2):
+            split_length:int = 2, split_overlap = 0):
 
         """ this is required method to invoke the component in
         the pipeline implementation.
@@ -198,11 +198,11 @@ class UdfPreProcessor(BaseComponent):
 
         if split_by == 'sentence':
             split_respect_sentence_boundary = False
-            split_overlap=self.split_overlap_sentence
+            # split_overlap=self.split_overlap_sentence
 
         else:
             split_respect_sentence_boundary = True
-            split_overlap= self.split_overlap_word
+            # split_overlap= self.split_overlap_word
 
         preprocessor = PreProcessor(
             clean_empty_lines=True,
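To show where the new run() parameter lands, here is a hedged sketch of the Haystack 1.x PreProcessor construction the method performs; only clean_empty_lines is visible in the hunk above, so the remaining keyword arguments are assumptions based on Haystack defaults:

from haystack.nodes import PreProcessor

# split_by, split_length and split_overlap now arrive as run() arguments
# instead of class attributes read from the [preprocessor] config section.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    split_by='word',                        # or 'sentence'
    split_length=110,                       # paramconfig [sdg] SPLIT_LENGTH
    split_overlap=10,                       # paramconfig [sdg] SPLIT_OVERLAP
    split_respect_sentence_boundary=True,   # True for word splitting here
)
documents = preprocessor.process(docs)      # docs: List[Document]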
utils/sdg_classifier.py
CHANGED
@@ -86,12 +86,15 @@ def runSDGPreprocessingPipeline()->List[Document]:
     sdg_processing_pipeline = processingpipeline()
     split_by = config.get('sdg','SPLIT_BY')
     split_length = int(config.get('sdg','SPLIT_LENGTH'))
+    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+
 
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                             "UdfPreProcessor": {"removePunc": False, \
                                         "split_by": split_by, \
-                                        "split_length":split_length}})
+                                        "split_length":split_length,\
+                                        "split_overlap": split_overlap}})
 
     return output_sdg_pre['documents']
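A hedged usage sketch of the updated function; it takes no arguments because, as in the sibling pipelines in utils/search.py, the file path and name are assumed to come from st.session_state:

# Hypothetical call site. Returns Haystack Documents split into 110-word
# chunks that now overlap by 10 words ([sdg] SPLIT_OVERLAP above).
docs = runSDGPreprocessingPipeline()
for doc in docs[:3]:
    print(doc.content[:80])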
utils/search.py
CHANGED
@@ -117,6 +117,8 @@ def searchAnnotator(matches: List[List[int]], document):
                 label="ANSWER", background="#964448", color='#ffffff')))
             start = end_idx
 
+    annotated_text = annotated_text + document[end_idx:].text
+
     st.write(
         markdown(annotated_text),
         unsafe_allow_html=True,
@@ -137,9 +139,10 @@ def lexical_search(query:Text,documents:List[Document]):
                         top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
-        matches, doc = runSpacyMatcher(query_tokens,result.content)
-        st.write("Result {}".format(count))
-        searchAnnotator(matches, doc)
+        if result.content != "":
+            matches, doc = runSpacyMatcher(query_tokens,result.content)
+            st.write("Result {}".format(count))
+            searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
     """
@@ -159,13 +162,42 @@ def runLexicalPreprocessingPipeline()->List[Document]:
     sdg_processing_pipeline = processingpipeline()
     split_by = config.get('lexical_search','SPLIT_BY')
     split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
 
     output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
                             params= {"FileConverter": {"file_path": file_path, \
                                         "file_name": file_name},
                             "UdfPreProcessor": {"removePunc": False, \
                                         "split_by": split_by, \
-                                        "split_length":split_length}})
+                                        "split_length":split_length,\
+                                        "split_overlap": split_overlap}})
 
     return output_lexical_pre['documents']
 
+def runSemanticPreprocessingPipeline()->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    Return
+    --------------
+    List[Document]: When the preprocessing pipeline is run, the output dictionary
+    has four objects. For the Haystack implementation of SDG classification, we
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+
+    """
+    file_path = st.session_state['filepath']
+    file_name = st.session_state['filename']
+    sdg_processing_pipeline = processingpipeline()
+    split_by = config.get('lexical_search','SPLIT_BY')
+    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+
+    output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                            "UdfPreProcessor": {"removePunc": False, \
+                                        "split_by": split_by, \
+                                        "split_length":split_length}})
+
+    return output_lexical_pre['documents']
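Finally, a plain-Python illustration of what SPLIT_OVERLAP does, independent of Haystack and assuming the usual sliding-window semantics (each chunk starts split_length - split_overlap words after the previous one); the numbers mirror the [sdg] section:

def split_with_overlap(text, split_length=110, split_overlap=10):
    # Sliding window over words: consecutive chunks share `split_overlap`
    # words, so sentences straddling a chunk boundary keep some context.
    words = text.split()
    step = split_length - split_overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + split_length]))
        if start + split_length >= len(words):
            break
    return chunks

# 300 words -> windows starting at 0, 100, 200: three chunks, each sharing
# 10 words with its neighbour.
chunks = split_with_overlap("word " * 300)
print(len(chunks))  # 3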