TeresaK committed
Commit
d47c4f8
1 Parent(s): 0723c2a

Update utils/vulnerability_classifier.py

Files changed (1)
  1. utils/vulnerability_classifier.py +281 -126
utils/vulnerability_classifier.py CHANGED
@@ -1,152 +1,307 @@
- from typing import List, Tuple
  from typing_extensions import Literal
- import logging
  import pandas as pd
- from pandas import DataFrame, Series
- from utils.config import getconfig
- from utils.preprocessing import processingpipeline
  import streamlit as st
- from transformers import pipeline
- from setfit import SetFitModel
-
- label_dict= {0: 'Agricultural communities',
-              1: 'Children',
-              2: 'Coastal communities',
-              3: 'Ethnic, racial or other minorities',
-              4: 'Fishery communities',
-              5: 'Informal sector workers',
-              6: 'Members of indigenous and local communities',
-              7: 'Migrants and displaced persons',
-              8: 'Older persons',
-              9: 'Other',
-              10: 'Persons living in poverty',
-              11: 'Persons with disabilities',
-              12: 'Persons with pre-existing health conditions',
-              13: 'Residents of drought-prone regions',
-              14: 'Rural populations',
-              15: 'Sexual minorities (LGBTQI+)',
-              16: 'Urban populations',
-              17: 'Women and other genders'}
-
- def get_vulnerability_labels(preds):

      """
-     Function that takes the numerical predictions as an input and returns a list of the labels.

-     """

-     # Get label names
-     preds_list = preds.tolist()

-     # Get the name of the group where the prediction is equal to "1"
-     result = []

-     for sublist in preds_list:
-         names = [label_dict[key] for key, value in enumerate(sublist) if value == 1]
-         result.append(names)

-     return result

- @st.cache_resource
- def load_vulnerabilityClassifier(config_file:str = None, classifier_name:str = None):
      """
-     loads the document classifier using haystack, where the name/path of model
-     in HF-hub as string is used to fetch the model object.Either configfile or
-     model should be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
      Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if modelname is passed, it takes a priority if not \
-     found then will look for configfile, else raise error.
-     Return: document classifier model
      """

-     # If no classifier given

-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('vulnerability','MODEL')

-     logging.info("Loading vulnerability classifier")

-     # we are using the pipeline as the model is multilabel and DocumentClassifier
-     # from Haystack doesnt support multilabel
-     # in pipeline we use 'sigmoid' to explicitly tell pipeline to make it multilabel
-     # if not then it will automatically use softmax, which is not a desired thing.
-     # doc_classifier = TransformersDocumentClassifier(
-     #                     model_name_or_path=classifier_name,
-     #                     task="text-classification",
-     #                     top_k = None)
-
-     # Download model from HF Hub
-     doc_classifier = SetFitModel.from_pretrained("leavoigt/vulnerability_multilabel")

-     # doc_classifier = pipeline("text-classification",
-     #                           model=classifier_name,
-     #                           return_all_scores=True,
-     #                           function_to_apply= "sigmoid")

-     return doc_classifier


- @st.cache_data
- def vulnerability_classification(haystack_doc:pd.DataFrame,
-                                  threshold:float = 0.5,
-                                  classifier_model:pipeline= None
-                                  )->Tuple[DataFrame,Series]:
      """
-     Text-Classification on the list of texts provided. Classifier provides the
-     most appropriate label for each text. these labels are in terms of if text
-     reference a group in a vulnerable situation.
-     ---------
-     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
-     contains the list of paragraphs in different format,here the list of
-     Haystack Documents is used.
-     threshold: threshold value for the model to keep the results from classifier
-     classifiermodel: you can pass the classifier model directly,which takes priority
-     however if not then looks for model in streamlit session.
-     In case of streamlit avoid passing the model directly.
-     Returns
-     ----------
-     df: Dataframe with two columns['SDG:int', 'text']
-     x: Series object with the unique SDG covered in the document uploaded and
-     the number of times it is covered/discussed/count_of_paragraphs.
      """
-     logging.info("Working on vulnerability Identification")
-     haystack_doc['Vulnerability Label'] = 'NA'
-     # haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
-
-     # df1 = haystack_doc[haystack_doc['PA_check'] == True]
-     # df = haystack_doc[haystack_doc['PA_check'] == False]
-     if not classifier_model:
-         classifier_model = st.session_state['vulnerability_classifier']
-
-     predictions = classifier_model(list(haystack_doc.text))

-

-     pred_labels = get_vulnerability_labels(predictions)
-
-     haystack_doc['Vulnerability Label'] = pred_labels
-     # placeholder = {}
-     # for j in range(len(temp)):
-     #     placeholder[temp[j]['label']] = temp[j]['score']
-     #     list_.append(placeholder)
-     # labels_ = [{**list_[l]} for l in range(len(predictions))]
-     # truth_df = DataFrame.from_dict(labels_)
-     # truth_df = truth_df.round(2)
-     # truth_df = truth_df.astype(float) >= threshold
-     # truth_df = truth_df.astype(str)
-     # categories = list(truth_df.columns)
-     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
-     #                                   None for i in categories}, axis=1)
-     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: list(x['Vulnerability Label']
-     #                                   -{None}),axis=1)
-     # haystack_doc['Vulnerability Label'] = list(truth_df['Vulnerability Label'])
-     return haystack_doc
 
+ from haystack.nodes.base import BaseComponent
+ from haystack.schema import Document
+ from haystack.nodes import ImageToTextConverter, PDFToTextConverter
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+ from pdf2image import convert_from_path
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
  from typing_extensions import Literal
  import pandas as pd
+ import logging
+ import re
+ import string
+ from haystack.pipelines import Pipeline
  import streamlit as st

+ @st.cache_data
+ def useOCR(file_path: str)-> Text:
      """
+     Converts image PDFs into text, using the farm-haystack OCR converter.

+     Params
+     ----------
+     file_path: file path of the uploaded file, returned by the add_upload function in
+                uploadAndExample.py

+     Returns the text of the file as a string.
+     """
+     # the pdf file first needs to be converted into image files,
+     # one image per page
+     images = convert_from_path(pdf_path = file_path)
+     list_ = []
+     # save the image files in the cache and read them one by one to pass to the OCR
+     for i, pdf in enumerate(images):
+         # Save the pages of the pdf as images
+         pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
+         list_.append(f'PDF\image_converted_{i+1}.png')

+     converter = ImageToTextConverter(remove_numeric_tables=True,
+                                      valid_languages=["eng"])
+     # placeholder to collect the text from each page
+     placeholder = []
+     for file in list_:
+         document = converter.convert(
+             file_path=file, meta=None,
+         )[0]
+
+         text = document.content
+         placeholder.append(text)
+     # join the text from each page by the page separator
+     text = '\x0c'.join(placeholder)
+     return text
+
+
+
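Not part of the diff, but for orientation: a minimal sketch of how useOCR could be called on a scanned PDF. The file path is a placeholder, and the function above assumes a 'PDF' directory exists for the intermediate page images it writes.

    # Sketch only: path is illustrative.
    raw_text = useOCR(file_path="docStore/sample_scanned.pdf")
    pages = raw_text.split('\x0c')   # pages were joined with the form-feed separator
    print(f"OCR recovered {len(pages)} pages, {len(raw_text)} characters in total")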
+ class FileConverter(BaseComponent):
+     """
+     Wrapper class to convert an uploaded document into text by calling the
+     appropriate converter class; it will internally use the Haystack OCR route
+     (see useOCR above) in case of an image-only pdf. The FileClassifier from
+     Haystack cannot be used as it doesn't have any label/output class for images.
+     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
+     2. https://docs.haystack.deepset.ai/docs/file_converters
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
+     4. https://docs.haystack.deepset.ai/reference/file-converters-api
+     """
+
+     outgoing_edges = 1
+
+     def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
+             id_hash_keys: Optional[List[str]] = None,
+             ) -> Tuple[dict,str]:
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         file_name: name of the file
+         file_path: file path of the uploaded file, returned by the add_upload function in
+                    uploadAndExample.py
+
+         See the links provided in the class docstring/description for the other params.
+
+         Return
+         ---------
+         output: dictionary, with the key as identifier and the value being whatever
+                 we need to return. In this case it is the list of Haystack Documents.
+
+         output_1: as there is only one outgoing edge, we pass the 'output_1' string
+         """
+         try:
+             if file_name.endswith('.pdf'):
+                 converter = PDFToTextConverter(remove_numeric_tables=True)
+             if file_name.endswith('.txt'):
+                 converter = TextConverter(remove_numeric_tables=True)
+             if file_name.endswith('.docx'):
+                 converter = DocxToTextConverter()
+         except Exception as e:
+             logging.error(e)
+             return
+
+
+
+         documents = []
+
+         document = converter.convert(
+             file_path=file_path, meta=None,
+             encoding=encoding, id_hash_keys=id_hash_keys
+         )[0]
+
+         text = document.content

+         # in case of a scanned/image-only PDF the content might contain only
+         # the page separator (\f or \x0c). We check whether that is so and
+         # use the OCR to get the text.
+         filtered = re.sub(r'\x0c', '', text)

+         if filtered == "":
+             logging.info("Using OCR")
+             text = useOCR(file_path)
+
+         documents.append(Document(content=text,
+                                   meta={"name": file_name},
+                                   id_hash_keys=id_hash_keys))
+
+
+
+         logging.info('file conversion successful')
+         output = {'documents': documents}
+         return output, 'output_1'
+
+     def run_batch():
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing happens here; however, to use the custom node we
+         need to have this method on the class.
+         """

+         return
+
+
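As an aside (not part of the commit): the node can also be exercised on its own. A small sketch, with placeholder file name and path, of what run returns:

    # Sketch: run the converter node standalone; file name/path are placeholders.
    converter_node = FileConverter()
    output, edge = converter_node.run(file_name="report.pdf",
                                      file_path="docStore/report.pdf")
    doc = output['documents'][0]                      # a single haystack Document
    print(edge, doc.meta['name'], len(doc.content))   # 'output_1', 'report.pdf', ...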
+ def basic(s:str, remove_punc:bool = False):
+
      """
+     Performs basic cleaning of text.
      Params
+     ----------
+     s: string to be processed
+     remove_punc: whether to remove all punctuation, including ',' and '.', or not
+
+     Returns: processed string; see the comments in the source code for more info
      """
+
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+
+     # Remove new line characters
+     s = re.sub('\n', ' ', s)

+     # Remove punctuation
+     if remove_punc == True:
+         translator = str.maketrans(' ', ' ', string.punctuation)
+         s = s.translate(translator)
+     # Remove distracting single quotes and dotted pattern
+     s = re.sub("\'", " ", s)
+     s = s.replace("..","")

+     return s.strip()
+
+
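For illustration (not in the diff), what the two cleaning modes do to a small string; the sample text is invented:

    sample = "Overview:\nsee https://example.org for more.."
    print(basic(sample))                    # URLs, newlines and '..' removed, punctuation kept
    print(basic(sample, remove_punc=True))  # additionally strips all punctuation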
+ def paraLengthCheck(paraList, max_len = 100):
+     """
+     There are cases where the preprocessor cannot respect the word limit when
+     using the respect-sentence-boundary flag, due to missing sentence boundaries.
+     Therefore we run one more round of splitting here for those paragraphs.

+     Params
+     ---------------
+     paraList : list of paragraphs/text
+     max_len : max length to be respected by sentences which bypassed the
+               preprocessor strategy
+
+     """
+     new_para_list = []
+     for passage in paraList:
+         # check if the para exceeds the word limit
+         if len(passage.content.split()) > max_len:
+             # we might need a few iterations, e.g. if para = 512 tokens
+             # we need to iterate 5 times to reduce the para to the size limit of 100
+             iterations = int(len(passage.content.split())/max_len)
+             for i in range(iterations):
+                 temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
+                 new_para_list.append((temp,passage.meta['page']))
+             temp = " ".join(passage.content.split()[max_len*(i+1):])
+             new_para_list.append((temp,passage.meta['page']))
+         else:
+             # paragraphs which don't need any splitting
+             new_para_list.append((passage.content, passage.meta['page']))

+     logging.info("New paragraphs length {}".format(len(new_para_list)))
+     return new_para_list
+
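A quick sketch (not in the diff) of the splitting behaviour, using a synthetic 250-word Document that carries the 'page' meta field the function expects:

    # Document is already imported at the top of this module.
    long_doc = Document(content=" ".join(["word"] * 250), meta={"page": 1})
    chunks = paraLengthCheck([long_doc], max_len=100)
    print(len(chunks))    # 3 -> two 100-word chunks plus a 50-word remainder
    print(chunks[0][1])   # 1 -> each tuple carries the original page number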
+ class UdfPreProcessor(BaseComponent):
+     """
+     Class to preprocess the document returned by FileConverter. It will check
+     the splitting strategy, split the document by words or sentences and then
+     synthetically create the paragraphs.
+     1. https://docs.haystack.deepset.ai/docs/preprocessor
+     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
+     """
+     outgoing_edges = 1
+
+     def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
+             split_by: Literal["sentence", "word"] = 'sentence',
+             split_length:int = 2, split_respect_sentence_boundary:bool = False,
+             split_overlap:int = 0):
+
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         documents: documents from the output dictionary returned by FileConverter
+         remove_punc: whether to remove all punctuation, including ',' and '.', or not
+         split_by: document splitting strategy, either by word or by sentence
+         split_length: when synthetically creating the paragraphs from the document,
+                       it defines the length of a paragraph.
+         split_respect_sentence_boundary: used with the 'word' strategy for
+                                          splitting the text.
+         split_overlap: number of words or sentences that overlap when creating
+                        the paragraphs. This is done as one sentence or 'some words'
+                        only make sense when read together with others; therefore
+                        the overlap is used.
+
+         Return
+         ---------
+         output: dictionary, with the key as identifier and the value being whatever
+                 we need to return. In this case the output contains 4 objects:
+                 the paragraph texts as a list, the Haystack documents, a dataframe
+                 and the raw text as one string.
+
+         output_1: as there is only one outgoing edge, we pass the 'output_1' string
+
+         """
+
+         if split_by == 'sentence':
+             split_respect_sentence_boundary = False

+         else:
+             split_respect_sentence_boundary = split_respect_sentence_boundary
+
+         preprocessor = PreProcessor(
+             clean_empty_lines=True,
+             clean_whitespace=True,
+             clean_header_footer=True,
+             split_by=split_by,
+             split_length=split_length,
+             split_respect_sentence_boundary= split_respect_sentence_boundary,
+             split_overlap=split_overlap,

+             # will add the page number only in case of PDF, not for text/docx files.
+             add_page_number=True
+         )
+
+         for i in documents:
+             # # basic cleaning before passing it to preprocessor.
+             # i = basic(i)
+             docs_processed = preprocessor.process([i])
+             if apply_clean:
+                 for item in docs_processed:
+                     item.content = basic(item.content, remove_punc= remove_punc)
+             else:
+                 pass

+         df = pd.DataFrame(docs_processed)
+         all_text = " ".join(df.content.to_list())
+         para_list = df.content.to_list()
+         logging.info('document split into {} paragraphs'.format(len(para_list)))
+         output = {'documents': docs_processed,
+                   'dataframe': df,
+                   'text': all_text,
+                   'paraList': para_list
+                   }
+         return output, "output_1"
+     def run_batch():
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing happens here; however, to use the custom node we
+         need to have this method on the class.
+         """
+         return
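Again as a sketch outside the diff: running the preprocessor node directly on an in-memory Document, with word-based splitting; the content string and parameter values are illustrative.

    # Sketch: split an in-memory Document into ~60-word paragraphs with a 10-word overlap.
    doc = Document(content="A long report text ...", meta={"name": "report.pdf"})
    processed, _ = UdfPreProcessor().run(documents=[doc],
                                         split_by='word', split_length=60,
                                         split_respect_sentence_boundary=True,
                                         split_overlap=10)
    paragraphs = processed['paraList']      # cleaned paragraph strings
    df = processed['dataframe']             # same content as a pandas DataFrame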

+ def processingpipeline():
      """
+     Returns the preprocessing pipeline. It will use the FileConverter and
+     UdfPreProcessor defined above.
      """

+     preprocessing_pipeline = Pipeline()
+     file_converter = FileConverter()
+     custom_preprocessor = UdfPreProcessor()

+     preprocessing_pipeline.add_node(component=file_converter,
+                                     name="FileConverter", inputs=["File"])
+     preprocessing_pipeline.add_node(component = custom_preprocessor,
+                                     name ='UdfPreProcessor', inputs=["FileConverter"])
+
+     return preprocessing_pipeline
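To close, a hedged end-to-end sketch (not part of the commit) of how this pipeline would typically be invoked, passing the per-node arguments through Haystack's params mechanism; the file name/path and split settings are placeholders.

    # Sketch: run the full preprocessing pipeline on an uploaded file.
    pipeline = processingpipeline()
    output = pipeline.run(params={"FileConverter": {"file_path": "docStore/report.pdf",
                                                    "file_name": "report.pdf"},
                                  "UdfPreProcessor": {"split_by": "word",
                                                      "split_length": 60,
                                                      "split_overlap": 10}})
    # the pipeline returns the output of its last node (UdfPreProcessor)
    paragraphs = output['paraList']
    full_text = output['text']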