prashant committed on
Commit
1d3978a
·
1 Parent(s): 3d34c75

updating overlap in preprocessing

Browse files
appStore/keyword_search.py CHANGED
@@ -5,8 +5,7 @@ sys.path.append('../utils')
5
  import streamlit as st
6
  import json
7
  import logging
8
- from utils.search import runLexicalPreprocessingPipeline, tokenize_lexical_query
9
- from utils.search import runSpacyMatcher, lexical_search
10
 
11
  def app():
12
 
 
5
  import streamlit as st
6
  import json
7
  import logging
8
+ from utils.search import runLexicalPreprocessingPipeline, lexical_search
 
9
 
10
  def app():
11
 
paramconfig.cfg CHANGED
@@ -3,18 +3,23 @@ TOP_K = 20
3
  THRESHOLD = 0.1
4
  SPLIT_BY = sentence
5
  SPLIT_LENGTH = 3
 
6
 
7
  [semantic_search]
8
  TOP_K = 10
9
  MAX_SEQ_LENGTH = 64
10
  MODEL_NAME = msmarco-distilbert-cos-v5
11
  THRESHOLD = 0.1
 
 
 
12
 
13
  [sdg]
14
  THRESHOLD = 0.85
15
  MODEL = jonas/sdg_classifier_osdg
16
  SPLIT_BY = word
17
  SPLIT_LENGTH = 110
 
18
 
19
  [preprocessor]
20
  SPLIT_OVERLAP_WORD = 10
 
3
  THRESHOLD = 0.1
4
  SPLIT_BY = sentence
5
  SPLIT_LENGTH = 3
6
+ SPLIT_OVERLAP = 0
7
 
8
  [semantic_search]
9
  TOP_K = 10
10
  MAX_SEQ_LENGTH = 64
11
  MODEL_NAME = msmarco-distilbert-cos-v5
12
  THRESHOLD = 0.1
13
+ SPLIT_BY = sentence
14
+ SPLIT_LENGTH = 3
15
+ SPLIT_OVERLAP = 0
16
 
17
  [sdg]
18
  THRESHOLD = 0.85
19
  MODEL = jonas/sdg_classifier_osdg
20
  SPLIT_BY = word
21
  SPLIT_LENGTH = 110
22
+ SPLIT_OVERLAP = 10
23
 
24
  [preprocessor]
25
  SPLIT_OVERLAP_WORD = 10
utils/preprocessing.py CHANGED
@@ -167,12 +167,12 @@ class UdfPreProcessor(BaseComponent):
167
 
168
  """
169
  outgoing_edges = 1
170
- split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
171
- split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
172
 
173
  def run(self, documents:List[Document], removePunc:bool,
174
  split_by: Literal["sentence", "word"] = 'sentence',
175
- split_length:int = 2):
176
 
177
  """ this is required method to invoke the component in
178
  the pipeline implementation.
@@ -198,11 +198,11 @@ class UdfPreProcessor(BaseComponent):
198
 
199
  if split_by == 'sentence':
200
  split_respect_sentence_boundary = False
201
- split_overlap=self.split_overlap_sentence
202
 
203
  else:
204
  split_respect_sentence_boundary = True
205
- split_overlap= self.split_overlap_word
206
 
207
  preprocessor = PreProcessor(
208
  clean_empty_lines=True,
 
167
 
168
  """
169
  outgoing_edges = 1
170
+ # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
171
+ # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
172
 
173
  def run(self, documents:List[Document], removePunc:bool,
174
  split_by: Literal["sentence", "word"] = 'sentence',
175
+ split_length:int = 2, split_overlap = 0):
176
 
177
  """ this is required method to invoke the component in
178
  the pipeline implementation.
 
198
 
199
  if split_by == 'sentence':
200
  split_respect_sentence_boundary = False
201
+ # split_overlap=self.split_overlap_sentence
202
 
203
  else:
204
  split_respect_sentence_boundary = True
205
+ # split_overlap= self.split_overlap_word
206
 
207
  preprocessor = PreProcessor(
208
  clean_empty_lines=True,
utils/sdg_classifier.py CHANGED
@@ -86,12 +86,15 @@ def runSDGPreprocessingPipeline()->List[Document]:
86
  sdg_processing_pipeline = processingpipeline()
87
  split_by = config.get('sdg','SPLIT_BY')
88
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
 
 
89
 
90
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
91
  params= {"FileConverter": {"file_path": file_path, \
92
  "file_name": file_name},
93
  "UdfPreProcessor": {"removePunc": False, \
94
  "split_by": split_by, \
95
- "split_length":split_length}})
 
96
 
97
  return output_sdg_pre['documents']
 
86
  sdg_processing_pipeline = processingpipeline()
87
  split_by = config.get('sdg','SPLIT_BY')
88
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
89
+ split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
90
+
91
 
92
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
93
  params= {"FileConverter": {"file_path": file_path, \
94
  "file_name": file_name},
95
  "UdfPreProcessor": {"removePunc": False, \
96
  "split_by": split_by, \
97
+ "split_length":split_length,\
98
+ "split_overlap": split_overlap}})
99
 
100
  return output_sdg_pre['documents']
utils/search.py CHANGED
@@ -117,6 +117,8 @@ def searchAnnotator(matches: List[List[int]], document):
117
  label="ANSWER", background="#964448", color='#ffffff')))
118
  start = end_idx
119
 
 
 
120
  st.write(
121
  markdown(annotated_text),
122
  unsafe_allow_html=True,
@@ -137,9 +139,10 @@ def lexical_search(query:Text,documents:List[Document]):
137
  top_k= int(config.get('lexical_search','TOP_K')))
138
  query_tokens = tokenize_lexical_query(query)
139
  for count, result in enumerate(results):
140
- matches, doc = runSpacyMatcher(query_tokens,result.content)
141
- st.write("Result {}".format(count))
142
- searchAnnotator(matches, doc)
 
143
 
144
  def runLexicalPreprocessingPipeline()->List[Document]:
145
  """
@@ -159,13 +162,42 @@ def runLexicalPreprocessingPipeline()->List[Document]:
159
  sdg_processing_pipeline = processingpipeline()
160
  split_by = config.get('lexical_search','SPLIT_BY')
161
  split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
 
162
 
163
  output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
164
  params= {"FileConverter": {"file_path": file_path, \
165
  "file_name": file_name},
166
  "UdfPreProcessor": {"removePunc": False, \
167
  "split_by": split_by, \
168
- "split_length":split_length}})
 
169
 
170
  return output_lexical_pre['documents']
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  label="ANSWER", background="#964448", color='#ffffff')))
118
  start = end_idx
119
 
120
+ annotated_text = annotated_text + document[end_idx:].text
121
+
122
  st.write(
123
  markdown(annotated_text),
124
  unsafe_allow_html=True,
 
139
  top_k= int(config.get('lexical_search','TOP_K')))
140
  query_tokens = tokenize_lexical_query(query)
141
  for count, result in enumerate(results):
142
+ if result.content != "":
143
+ matches, doc = runSpacyMatcher(query_tokens,result.content)
144
+ st.write("Result {}".format(count))
145
+ searchAnnotator(matches, doc)
146
 
147
  def runLexicalPreprocessingPipeline()->List[Document]:
148
  """
 
162
  sdg_processing_pipeline = processingpipeline()
163
  split_by = config.get('lexical_search','SPLIT_BY')
164
  split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
165
+ split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
166
 
167
  output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
168
  params= {"FileConverter": {"file_path": file_path, \
169
  "file_name": file_name},
170
  "UdfPreProcessor": {"removePunc": False, \
171
  "split_by": split_by, \
172
+ "split_length":split_length,\
173
+ "split_overlap": split_overlap}})
174
 
175
  return output_lexical_pre['documents']
176
 
177
+ def runSemanticPreprocessingPipeline()->List[Document]:
178
+ """
179
+ creates the pipeline and runs the preprocessing pipeline,
180
+ the params for pipeline are fetched from paramconfig
181
+
182
+ Return
183
+ --------------
184
+ List[Document]: When preprocessing pipeline is run, the output dictionary
185
+ has four objects. For the Haystack implementation of SDG classification, we
186
+ need to use the List of Haystack Document, which can be fetched by
187
+ key = 'documents' on output.
188
+
189
+ """
190
+ file_path = st.session_state['filepath']
191
+ file_name = st.session_state['filename']
192
+ sdg_processing_pipeline = processingpipeline()
193
+ split_by = config.get('lexical_search','SPLIT_BY')
194
+ split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
195
+
196
+ output_lexical_pre = sdg_processing_pipeline.run(file_paths = file_path,
197
+ params= {"FileConverter": {"file_path": file_path, \
198
+ "file_name": file_name},
199
+ "UdfPreProcessor": {"removePunc": False, \
200
+ "split_by": split_by, \
201
+ "split_length":split_length}})
202
+
203
+ return output_lexical_pre['documents']