TeresaK committed on
Commit e569bf0
1 Parent(s): d47c4f8

Update utils/vulnerability_classifier.py

Files changed (1)
  1. utils/vulnerability_classifier.py +127 -281
utils/vulnerability_classifier.py CHANGED
@@ -1,307 +1,153 @@
- from haystack.nodes.base import BaseComponent
- from haystack.schema import Document
- from haystack.nodes import ImageToTextConverter, PDFToTextConverter
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
- from pdf2image import convert_from_path
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
  from typing_extensions import Literal
- import pandas as pd
  import logging
- import re
- import string
- from haystack.pipelines import Pipeline
  import streamlit as st

- @st.cache_data
- def useOCR(file_path: str)-> Text:
      """
-     Converts image pdfs into text, Using the Farm-haystack[OCR]
-
-     Params
-     ----------
-     file_path: file_path of uploade file, returned by add_upload function in
-     uploadAndExample.py
-
-     Returns the text file as string.
      """
-     # we need pdf file to be first converted into image file
-     # this will create each page as image file
-     images = convert_from_path(pdf_path = file_path)
-     list_ = []
-     # save image file in cache and read them one by one to pass it to OCR
-     for i, pdf in enumerate(images):
-         # Save pages as images in the pdf
-         pdf.save(f'PDF\image_converted_{i+1}.png', 'PNG')
-         list_.append(f'PDF\image_converted_{i+1}.png')
-
-     converter = ImageToTextConverter(remove_numeric_tables=True,
-                                      valid_languages=["eng"])
-     # placeholder to collect the text from each page
-     placeholder = []
-     for file in list_:
-         document = converter.convert(
-             file_path=file, meta=None,
-         )[0]
-
-         text = document.content
-         placeholder.append(text)
-     # join the text from each page by page separator
-     text = '\x0c'.join(placeholder)
-     return text
-
-
- class FileConverter(BaseComponent):
-     """
-     Wrapper class to convert uploaded document into text by calling appropriate
-     Converter class, will use internally haystack PDFToTextOCR in case of image
-     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
-     label/output class for image.
-     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-     2. https://docs.haystack.deepset.ai/docs/file_converters
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-     """
-
-     outgoing_edges = 1
-
-     def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
-             id_hash_keys: Optional[List[str]] = None,
-             ) -> Tuple[dict,str]:
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         file_name: name of file
-         file_path: file_path of uploade file, returned by add_upload function in
-         uploadAndExample.py
-
-         See the links provided in Class docstring/description to see other params
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-         we need to return. In this case its the List of Hasyatck Document
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-         """
-         try:
-             if file_name.endswith('.pdf'):
-                 converter = PDFToTextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.txt'):
-                 converter = TextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter()
-         except Exception as e:
-             logging.error(e)
-             return
-
-
-         documents = []
-
-         document = converter.convert(
-             file_path=file_path, meta=None,
-             encoding=encoding, id_hash_keys=id_hash_keys
-         )[0]
-
-         text = document.content
-
-         # in case of scanned/images only PDF the content might contain only
-         # the page separator (\f or \x0c). We check if is so and use
-         # use the OCR to get the text.
-         filtered = re.sub(r'\x0c', '', text)
-
-         if filtered == "":
-             logging.info("Using OCR")
-             text = useOCR(file_path)
-
-         documents.append(Document(content=text,
-                                   meta={"name": file_name},
-                                   id_hash_keys=id_hash_keys))
-
-         logging.info('file conversion succesful')
-         output = {'documents': documents}
-         return output, 'output_1'
-
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-
-         return
-
-
- def basic(s:str, remove_punc:bool = False):
-
      """
-     Performs basic cleaning of text.
      Params
-     ----------
-     s: string to be processed
-     removePunc: to remove all Punctuation including ',' and '.' or not
-
-     Returns: processed string: see comments in the source code for more info
      """
-
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-
-     # Remove new line characters
-     s = re.sub('\n', ' ', s)
-
-     # Remove punctuations
-     if remove_punc == True:
-         translator = str.maketrans(' ', ' ', string.punctuation)
-         s = s.translate(translator)
-     # Remove distracting single quotes and dotted pattern
-     s = re.sub("\'", " ", s)
-     s = s.replace("..","")
-
-     return s.strip()
-
-
- def paraLengthCheck(paraList, max_len = 100):
-     """
-     There are cases where preprocessor cannot respect word limit, when using
-     respect sentence boundary flag due to missing sentence boundaries.
-     Therefore we run one more round of split here for those paragraphs
-
-     Params
-     ---------------
-     paraList : list of paragraphs/text
-     max_len : max length to be respected by sentences which bypassed
-     preprocessor strategy
-
-     """
-     new_para_list = []
-     for passage in paraList:
-         # check if para exceeds words limit
-         if len(passage.content.split()) > max_len:
-             # we might need few iterations example if para = 512 tokens
-             # we need to iterate 5 times to reduce para to size limit of '100'
-             iterations = int(len(passage.content.split())/max_len)
-             for i in range(iterations):
-                 temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
-                 new_para_list.append((temp,passage.meta['page']))
-             temp = " ".join(passage.content.split()[max_len*(i+1):])
-             new_para_list.append((temp,passage.meta['page']))
          else:
-             # paragraphs which dont need any splitting
-             new_para_list.append((passage.content, passage.meta['page']))
-
-     logging.info("New paragraphs length {}".format(len(new_para_list)))
-     return new_para_list
-
- class UdfPreProcessor(BaseComponent):
-     """
-     class to preprocess the document returned by FileConverter. It will check
-     for splitting strategy and splits the document by word or sentences and then
-     synthetically create the paragraphs.
-     1. https://docs.haystack.deepset.ai/docs/preprocessor
-     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-     """
-     outgoing_edges = 1
-
-     def run(self, documents:List[Document], remove_punc:bool=False, apply_clean = True,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0):
-
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         documents: documents from the output dictionary returned by Fileconverter
-         remove_punc: to remove all Punctuation including ',' and '.' or not
-         split_by: document splitting strategy either as word or sentence
-         split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-         split_respect_sentence_boundary: Used when using 'word' strategy for
-         splititng of text.
-         split_overlap: Number of words or sentences that overlap when creating
-         the paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-         we need to return. In this case the output will contain 4 objects
-         the paragraphs text list as List, Haystack document, Dataframe and
-         one raw text file.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-
-         """
-
-         if split_by == 'sentence':
-             split_respect_sentence_boundary = False
-
-         else:
-             split_respect_sentence_boundary = split_respect_sentence_boundary
-
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=True,
-             split_by=split_by,
-             split_length=split_length,
-             split_respect_sentence_boundary= split_respect_sentence_boundary,
-             split_overlap=split_overlap,
-
-             # will add page number only in case of PDF not for text/docx file.
-             add_page_number=True
-         )
-
-         for i in documents:
-             # # basic cleaning before passing it to preprocessor.
-             # i = basic(i)
-             docs_processed = preprocessor.process([i])
-             if apply_clean:
-                 for item in docs_processed:
-                     item.content = basic(item.content, remove_punc= remove_punc)
-             else:
-                 pass
-
-         df = pd.DataFrame(docs_processed)
-         all_text = " ".join(df.content.to_list())
-         para_list = df.content.to_list()
-         logging.info('document split into {} paragraphs'.format(len(para_list)))
-         output = {'documents': docs_processed,
-                   'dataframe': df,
-                   'text': all_text,
-                   'paraList': para_list
-                   }
-         return output, "output_1"
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-         return
-
- def processingpipeline():
      """
-     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
-     from utils.preprocessing
      """
-
-     preprocessing_pipeline = Pipeline()
-     file_converter = FileConverter()
-     custom_preprocessor = UdfPreProcessor()
-
-     preprocessing_pipeline.add_node(component=file_converter,
-                                     name="FileConverter", inputs=["File"])
-     preprocessing_pipeline.add_node(component = custom_preprocessor,
-                                     name ='UdfPreProcessor', inputs=["FileConverter"])
-
-     return preprocessing_pipeline
+ from typing import List, Tuple
  from typing_extensions import Literal
  import logging
+ import pandas as pd
+ from pandas import DataFrame, Series
+ from utils.config import getconfig
+ from utils.preprocessing import processingpipeline
  import streamlit as st
+ from transformers import pipeline
+ from setfit import SetFitModel
+
+ label_dict = {0: 'Agricultural communities',
+               1: 'Children',
+               2: 'Coastal communities',
+               3: 'Ethnic, racial or other minorities',
+               4: 'Fishery communities',
+               5: 'Informal sector workers',
+               6: 'Members of indigenous and local communities',
+               7: 'Migrants and displaced persons',
+               8: 'Older persons',
+               9: 'Other',
+               10: 'Persons living in poverty',
+               11: 'Persons with disabilities',
+               12: 'Persons with pre-existing health conditions',
+               13: 'Residents of drought-prone regions',
+               14: 'Rural populations',
+               15: 'Sexual minorities (LGBTQI+)',
+               16: 'Urban populations',
+               17: 'Women and other genders'}
+
+ def get_vulnerability_labels(preds):
      """
+     Function that takes the numerical predictions as an input and returns a list of the labels.
      """
+     # Get label names
+     preds_list = preds.tolist()
+
+     # Get the name of the group where the prediction is equal to "1"
+     result = []
+
+     for sublist in preds_list:
+         names = [label_dict[key] for key, value in enumerate(sublist) if value == 1]
+         result.append(names)
+
+     return result
+
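For illustration (not part of the diff), a minimal sketch of how get_vulnerability_labels maps a multilabel prediction matrix to group names. The array is made-up example data with one column per entry in label_dict, and numpy is assumed only for the demo:

import numpy as np
from utils.vulnerability_classifier import get_vulnerability_labels

# hypothetical predictions for two paragraphs: row 1 flags indices 1 and 17
# ('Children', 'Women and other genders'), row 2 flags no group at all
example_preds = np.array([
    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
])

print(get_vulnerability_labels(example_preds))
# expected: [['Children', 'Women and other genders'], []]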
+ @st.cache_resource
+ def load_vulnerabilityClassifier(config_file:str = None, classifier_name:str = None):
      """
+     Loads the vulnerability classifier, where the name/path of the model on the
+     HF Hub is used to fetch the model object. Either a config file or a model
+     name should be passed.
+     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+     2. https://docs.haystack.deepset.ai/docs/document_classifier
      Params
+     --------
+     config_file: config file path from which to read the model name
+     classifier_name: if a model name is passed it takes priority; if not given,
+     the name is read from the config file, otherwise a warning is logged.
+     Return: document classifier model
      """
+
+     # If no classifier given
+     if not classifier_name:
+         if not config_file:
+             logging.warning("Pass either model name or config file")
+             return
          else:
+             config = getconfig(config_file)
+             classifier_name = config.get('vulnerability','MODEL')
+
+     logging.info("Loading vulnerability classifier")
+
+     # The model is multilabel and the DocumentClassifier from Haystack doesn't
+     # support multilabel, so the SetFit model is loaded directly. With a
+     # transformers pipeline, 'sigmoid' would have to be requested explicitly to
+     # get multilabel scores; otherwise it defaults to softmax, which is not desired.
+     # doc_classifier = TransformersDocumentClassifier(
+     #     model_name_or_path=classifier_name,
+     #     task="text-classification",
+     #     top_k = None)
+
+     # Download model from HF Hub
+     doc_classifier = SetFitModel.from_pretrained(classifier_name)
+
+     # doc_classifier = pipeline("text-classification",
+     #                           model=classifier_name,
+     #                           return_all_scores=True,
+     #                           function_to_apply= "sigmoid")
+
+     return doc_classifier
+
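A hedged usage sketch for the loader above: the model id is a placeholder (in the app the real id is read from the MODEL entry of the 'vulnerability' section of the config file), and storing the model in st.session_state mirrors how vulnerability_classification later looks it up:

import streamlit as st
from utils.vulnerability_classifier import load_vulnerabilityClassifier

# placeholder model id, for illustration only
classifier = load_vulnerabilityClassifier(classifier_name="some-org/vulnerability-setfit")
st.session_state['vulnerability_classifier'] = classifier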
+ @st.cache_data
+ def vulnerability_classification(haystack_doc:pd.DataFrame,
+                                  threshold:float = 0.5,
+                                  classifier_model:pipeline = None
+                                  )-> DataFrame:
      """
+     Text classification on the list of texts provided. The classifier assigns to
+     each text the most appropriate labels, which state whether the text references
+     a group in a vulnerable situation.
+     ---------
+     haystack_doc: Dataframe produced by the preprocessing pipeline, with the
+     paragraphs to classify in the 'text' column.
+     threshold: threshold value for the model to keep the results from classifier
+     classifier_model: you can pass the classifier model directly, which takes
+     priority; if not, the model is looked up in the streamlit session state.
+     In case of streamlit avoid passing the model directly.
+     Returns
+     ----------
+     haystack_doc: the input dataframe with an added 'Vulnerability Label' column
+     holding, for each paragraph, the list of predicted vulnerable-group labels.
      """
+     logging.info("Working on vulnerability Identification")
+     haystack_doc['Vulnerability Label'] = 'NA'
+     # haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)
+
+     # df1 = haystack_doc[haystack_doc['PA_check'] == True]
+     # df = haystack_doc[haystack_doc['PA_check'] == False]
+     if not classifier_model:
+         classifier_model = st.session_state['vulnerability_classifier']
+
+     predictions = classifier_model(list(haystack_doc.text))
+
+     pred_labels = get_vulnerability_labels(predictions)
+
+     haystack_doc['Vulnerability Label'] = pred_labels
+     # placeholder = {}
+     # for j in range(len(temp)):
+     #     placeholder[temp[j]['label']] = temp[j]['score']
+     #     list_.append(placeholder)
+     # labels_ = [{**list_[l]} for l in range(len(predictions))]
+     # truth_df = DataFrame.from_dict(labels_)
+     # truth_df = truth_df.round(2)
+     # truth_df = truth_df.astype(float) >= threshold
+     # truth_df = truth_df.astype(str)
+     # categories = list(truth_df.columns)
+     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
+     #                                   None for i in categories}, axis=1)
+     # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: list(x['Vulnerability Label']
+     #                                   -{None}),axis=1)
+     # haystack_doc['Vulnerability Label'] = list(truth_df['Vulnerability Label'])
+     return haystack_doc
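Taken together, a minimal end-to-end sketch of how the new module is meant to be used; the paragraph dataframe and the config path are assumptions for illustration, not part of the commit:

import pandas as pd
import streamlit as st
from utils.vulnerability_classifier import (load_vulnerabilityClassifier,
                                            vulnerability_classification)

# stand-in for the dataframe produced by the preprocessing pipeline;
# only the 'text' column is required by vulnerability_classification
paras = pd.DataFrame({'text': [
    "Smallholder farmers and rural women are disproportionately affected.",
    "The national grid will be expanded to all provinces by 2030.",
]})

classifier = load_vulnerabilityClassifier(config_file="paramconfig.cfg")  # assumed config path
st.session_state['vulnerability_classifier'] = classifier

# the model is picked up from the session state, as the docstring advises
labelled = vulnerability_classification(paras)
print(labelled[['text', 'Vulnerability Label']])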