Spaces:
Sleeping
Sleeping
Update utils/sdg_classifier.py
Browse files
- utils/sdg_classifier.py +13 -13
utils/sdg_classifier.py
CHANGED
@@ -95,7 +95,7 @@ def classification(haystack_doc:List[Document],
|
|
95 |
the number of times it is covered/discussed/count_of_paragraphs.
|
96 |
|
97 |
"""
|
98 |
-
logging.info("Working on
|
99 |
if not classifier_model:
|
100 |
if check_streamlit():
|
101 |
classifier_model = st.session_state['vulnerability_classifier']
|
@@ -109,27 +109,27 @@ def classification(haystack_doc:List[Document],
|
|
109 |
labels_= [(l.meta['classification']['label'],
|
110 |
l.meta['classification']['score'],l.content,) for l in results]
|
111 |
|
112 |
-
df = DataFrame(labels_, columns=["
|
113 |
|
114 |
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
115 |
df.index += 1
|
116 |
df =df[df['Relevancy']>threshold]
|
117 |
|
118 |
# creating the dataframe for value counts of SDG, along with 'title' of SDGs
|
119 |
-
x = df['
|
120 |
x = x.rename('count')
|
121 |
-
x = x.rename_axis('
|
122 |
-
x["Vulnerability"] = pd.to_numeric(x["
|
123 |
x = x.sort_values(by=['count'], ascending=False)
|
124 |
-
x['
|
125 |
-
x['
|
126 |
|
127 |
-
df['
|
128 |
-
df = df.sort_values('
|
129 |
|
130 |
return df, x
|
131 |
|
132 |
-
def
|
133 |
split_by: Literal["sentence", "word"] = 'sentence',
|
134 |
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
135 |
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
@@ -163,9 +163,9 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
|
|
163 |
|
164 |
"""
|
165 |
|
166 |
-
|
167 |
|
168 |
-
|
169 |
params= {"FileConverter": {"file_path": file_path, \
|
170 |
"file_name": file_name},
|
171 |
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
@@ -174,4 +174,4 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
|
|
174 |
"split_overlap": split_overlap, \
|
175 |
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
176 |
|
177 |
-
return
|
|
|
95 |
the number of times it is covered/discussed/count_of_paragraphs.
|
96 |
|
97 |
"""
|
98 |
+
logging.info("Working on vulnerability Classification")
|
99 |
if not classifier_model:
|
100 |
if check_streamlit():
|
101 |
classifier_model = st.session_state['vulnerability_classifier']
|
|
|
109 |
labels_= [(l.meta['classification']['label'],
|
110 |
l.meta['classification']['score'],l.content,) for l in results]
|
111 |
|
112 |
+
df = DataFrame(labels_, columns=["vulnerability","Relevancy","text"])
|
113 |
|
114 |
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
115 |
df.index += 1
|
116 |
df =df[df['Relevancy']>threshold]
|
117 |
|
118 |
# creating the dataframe of value counts per vulnerability label, along with each label's name
|
119 |
+
x = df['vulnerability'].value_counts()
|
120 |
x = x.rename('count')
|
121 |
+
x = x.rename_axis('vulnerability').reset_index()
|
122 |
+
x["Vulnerability"] = pd.to_numeric(x["vulnerability"])
|
123 |
x = x.sort_values(by=['count'], ascending=False)
|
124 |
+
x['vulnerability_name'] = x['vulnerability'].apply(lambda x: _lab_dict[x])
|
125 |
+
x['vulnerability_Num'] = x['vulnerability'].apply(lambda x: "vulnerability "+str(x))
|
126 |
|
127 |
+
df['vulnerability'] = pd.to_numeric(df['vulnerability'])
|
128 |
+
df = df.sort_values('vulnerability')
|
129 |
|
130 |
return df, x
|
131 |
|
132 |
+
def runPreprocessingPipeline(file_name:str, file_path:str,
|
133 |
split_by: Literal["sentence", "word"] = 'sentence',
|
134 |
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
135 |
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
|
|
163 |
|
164 |
"""
|
165 |
|
166 |
+
processing_pipeline = processingpipeline()
|
167 |
|
168 |
+
output_pre = processing_pipeline.run(file_paths = file_path,
|
169 |
params= {"FileConverter": {"file_path": file_path, \
|
170 |
"file_name": file_name},
|
171 |
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
|
|
174 |
"split_overlap": split_overlap, \
|
175 |
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
176 |
|
177 |
+
return output_pre
|