prashant committed · Commit ce1209f · 1 parent: 1984bd1

info and sdg update

Files changed:
- appStore/info.py (+2 -3)
- appStore/sdg_analysis.py (+37 -6)
- paramconfig.cfg (+2 -1)
- utils/preprocessing.py (+5 -10)
- utils/sdg_classifier.py (+3 -2)
- utils/uploadAndExample.py (+0 -2)
appStore/info.py CHANGED

@@ -28,7 +28,7 @@ def app():
         </div>
     """
     st.markdown(footer, unsafe_allow_html=True)
-
+
     c1, c2, c3 = st.columns([8,1,12])
     with c1:
         st.image("docStore/img/ndc.png")
@@ -42,13 +42,12 @@ def app():
         evaluation of stated goals and targets and their actual implementation on \
         the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
         Language Processing (NLP) methods can help in shortening and easing this \
-        task for policy analysts.</div>',
+        task for policy analysts.</div><br>',
         unsafe_allow_html=True)

     intro = """
     <div style="text-align: justify;">

-
     For this purpose, the United Nations Sustainable Development Solutions \
     Network (SDSN) and the Deutsche Gesellschaft für Internationale \
     Zusammenarbeit (GIZ) GmbH are collaborating since 2021 in the development \
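The only substantive change here is the `<br>` appended after the closing `</div>`; it renders because the surrounding call passes unsafe_allow_html=True. A minimal sketch of that pattern, with placeholder copy rather than the app's actual text:

```python
# Minimal sketch of the edited pattern: the <br> is raw HTML, so it only
# renders when unsafe_allow_html=True is passed; the div text here is a
# placeholder, not the app's copy.
import streamlit as st

st.markdown('<div style="text-align: justify;">Intro text.</div><br>',
            unsafe_allow_html=True)
```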
appStore/sdg_analysis.py CHANGED

@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 def app():

     with st.container():
-        st.markdown("<h2 style='text-align: center; color: black;'> SDG
+        st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
         st.write(' ')
         st.write(' ')

@@ -31,12 +31,45 @@ def app():

         st.write(
             """
-            The *SDG Analysis
+            The *SDG Analysis* app is an easy-to-use interface built \
             in Streamlit for analyzing policy documents with respect to SDG \
             Classification for the paragraphs/texts in the document and \
             extracting the keyphrase per SDG label - developed by GIZ Data \
             and the Sustainable Development Solutions Network. \n
             """)
+        st.write("""Document Processing: The Uploaded/Selected document is \
+            automatically cleaned and split into paragraphs with a maximum \
+            length of 120 words using a Haystack preprocessing pipeline. The \
+            length of 120 is an empirical value which should reflect the length \
+            of a “context” and should limit the paragraph length deviation. \
+            However, since we want to respect the sentence boundary, the limit \
+            can be breached, and hence this limit of 120 is tentative.\n
+
+            SDG Classification: The application assigns paragraphs to 15 of \
+            the 17 United Nations Sustainable Development Goals (SDGs). SDG 16 \
+            “Peace, Justice and Strong Institutions” and SDG 17 \
+            “Partnerships for the Goals” are excluded from the analysis due to \
+            their broad nature, which could potentially inflate the results. \
+            Each paragraph is assigned to one SDG only. Again, the results are \
+            displayed in a summary table including the number of the SDG, a \
+            relevancy score highlighted through a green color shading, and the \
+            respective text of the analyzed paragraph. Additionally, a pie \
+            chart with a blue color shading is displayed which illustrates the \
+            three most prominent SDGs in the document. The SDG classification \
+            uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
+            from [OSDG.ai](https://osdg.ai/), which is a global \
+            partnership and growing community of researchers and institutions \
+            interested in the classification of research according to the \
+            Sustainable Development Goals. The summary table only displays \
+            paragraphs with a calculated relevancy score above 85%.\n
+
+            Keyphrase Extraction: The application extracts 15 keyphrases from \
+            the document, calculates a respective relevancy score, and displays \
+            the results in a summary table. The keyphrases are extracted \
+            using [Textrank](https://github.com/summanlp/textrank), which is an \
+            easy-to-use, computationally less expensive \
+            model leveraging a combination of TFIDF and graph networks.
+            """)
         st.markdown("")


@@ -57,11 +90,11 @@ def app():

         df, x = sdg_classification(allDocuments['documents'])
         sdg_labels = df.SDG.unique()
-        tfidfkeywordList = []
+        # tfidfkeywordList = []
         textrankkeywordlist = []
         for label in sdg_labels:
             sdgdata = " ".join(df[df.SDG == label].text.to_list())
-            tfidflist_ = keywordExtraction(label,[sdgdata])
+            # tfidflist_ = keywordExtraction(label,[sdgdata])
             textranklist_ = textrank(sdgdata, words = 20)
             tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
             textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
@@ -69,8 +102,6 @@ def app():
         tRkeywordsDf = pd.DataFrame(textrankkeywordlist)


-
-
         plt.rcParams['font.size'] = 25
         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
         # plot
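Note that the loop keeps `tfidfkeywordList.append(...)` even though the list's initialization and `tfidflist_` are now commented out, so as committed that line would raise a NameError unless it is commented out as well. The `textrank()` helper itself is not part of this diff; a minimal sketch of what it plausibly wraps, assuming it builds on the summa package linked in the new app description (the name and defaults here are guesses):

```python
# Plausible sketch of the textrank() helper called in the loop above,
# assuming it wraps the summa package linked in the app description;
# the real helper lives elsewhere in this repo.
from summa import keywords

def textrank(textdata: str, words: int = 20) -> list:
    # split=True returns the keyphrases as a list rather than one
    # newline-joined string
    return keywords.keywords(textdata, words=words, split=True)
```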
paramconfig.cfg CHANGED

@@ -22,8 +22,9 @@ THRESHOLD = 0.85
 MODEL = jonas/sdg_classifier_osdg
 SPLIT_BY = word
 REMOVE_PUNC = 0
-SPLIT_LENGTH =
+SPLIT_LENGTH = 120
 SPLIT_OVERLAP = 10
+RESPECT_SENTENCE_BOUNDARY = 1

 [preprocessor]
 SPLIT_OVERLAP_WORD = 10
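These keys are consumed via configparser in utils/sdg_classifier.py (see the hunks further below). A standalone sketch of the round-trip, mirroring the reads shown there: 0/1 flags must pass through int() before bool(), because bool("0") is truthy for any non-empty string.

```python
# How the new [sdg] keys are read back, mirroring utils/sdg_classifier.py;
# 0/1 flags go through int() first, since bool("0") would be True.
import configparser

config = configparser.ConfigParser()
config.read_file(open('paramconfig.cfg'))

split_length = int(config.get('sdg', 'SPLIT_LENGTH'))    # -> 120
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))  # -> 10
respect_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))  # -> True
```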
utils/preprocessing.py CHANGED

@@ -9,10 +9,6 @@ import logging
 import re
 import string
 from haystack.pipelines import Pipeline
-import configparser
-config = configparser.ConfigParser()
-config.read_file(open('paramconfig.cfg'))
-top_k = int(config.get('lexical_search','TOP_K'))

 def useOCR(file_path: str)-> Text:
     """
@@ -167,11 +163,10 @@ class UdfPreProcessor(BaseComponent):

     """
     outgoing_edges = 1
-    # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
-    # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))

     def run(self, documents:List[Document], removePunc:bool,
             split_by: Literal["sentence", "word"] = 'sentence',
+            split_respect_sentence_boundary = False,
             split_length:int = 2, split_overlap = 0):

         """ this is required method to invoke the component in
@@ -198,11 +193,9 @@ class UdfPreProcessor(BaseComponent):

         if split_by == 'sentence':
             split_respect_sentence_boundary = False
-            # split_overlap=self.split_overlap_sentence

         else:
-            split_respect_sentence_boundary =
-            # split_overlap= self.split_overlap_word
+            split_respect_sentence_boundary = split_respect_sentence_boundary

         preprocessor = PreProcessor(
             clean_empty_lines=True,
@@ -218,6 +211,8 @@ class UdfPreProcessor(BaseComponent):
         )

         for i in documents:
+            # # basic cleaning before passing it to preprocessor.
+            # i = basic(i)
             docs_processed = preprocessor.process([i])
             for item in docs_processed:
                 item.content = basic(item.content, removePunc= removePunc)
@@ -243,7 +238,7 @@ class UdfPreProcessor(BaseComponent):
 def processingpipeline():
     """
     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
-    from utils.
+    from utils.preprocessing

     """
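Two observations: the else branch's self-assignment is a no-op as written; presumably the intent is simply to pass the new run() argument through to the PreProcessor. With the config values from this commit, the PreProcessor that run() constructs receives roughly the arguments below. This is a standalone sketch against the Haystack v1 API; cleaning flags other than clean_empty_lines, and any other arguments the real code sets, are omitted here.

```python
# Standalone sketch of the PreProcessor that run() builds once the new
# [sdg] config values flow through (Haystack v1 API); other arguments the
# real code may pass are omitted.
from haystack.nodes import PreProcessor
from haystack.schema import Document

preprocessor = PreProcessor(
    clean_empty_lines=True,
    split_by="word",                       # SPLIT_BY = word
    split_length=120,                      # SPLIT_LENGTH = 120
    split_overlap=10,                      # SPLIT_OVERLAP = 10
    split_respect_sentence_boundary=True,  # RESPECT_SENTENCE_BOUNDARY = 1
)

doc = Document(content="Some long policy text ...")
paragraphs = preprocessor.process([doc])   # ~120-word chunks, whole sentences kept
```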
utils/sdg_classifier.py CHANGED

@@ -106,7 +106,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     split_length = int(config.get('sdg','SPLIT_LENGTH'))
     split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
     remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
-
+    split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))

     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                             params= {"FileConverter": {"file_path": filePath, \
@@ -114,6 +114,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
                             "UdfPreProcessor": {"removePunc": remove_punc, \
                             "split_by": split_by, \
                             "split_length":split_length,\
-                            "split_overlap": split_overlap}})
+                            "split_overlap": split_overlap, \
+                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})

     return output_sdg_pre
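For orientation, a hypothetical call site tying this wrapper back to appStore/sdg_analysis.py; the path and file name are placeholders, and only the 'documents' key access is taken from the actual code above:

```python
# Hypothetical call site connecting the two files changed in this commit;
# filePath/fileName are placeholders. The pipeline output's 'documents'
# key is what sdg_classification() consumes in appStore/sdg_analysis.py.
allDocuments = runSDGPreprocessingPipeline(filePath="/tmp/example.pdf",
                                           fileName="example.pdf")
df, x = sdg_classification(allDocuments['documents'])
```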
utils/uploadAndExample.py CHANGED

@@ -8,7 +8,6 @@ def add_upload(choice):
     the 'file' to streamlit session_state which then can be fetched later.

     """
-

     if choice == 'Upload Document':
         uploaded_file = st.sidebar.file_uploader('Upload the File',
@@ -21,7 +20,6 @@ def add_upload(choice):
             st.session_state['filepath'] = temp.name


-
     else:
         # listing the options
         option = st.sidebar.selectbox('Select the example document',
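Both hunks only drop stray blank lines. For context, a sketch of the upload flow the surrounding lines imply: the uploaded bytes are persisted to a temporary file so downstream file converters receive a real filesystem path. The file_uploader kwargs are truncated in the diff above, so the details here are assumptions rather than the repo's actual code.

```python
# Sketch of the upload pattern implied by the context lines: write the
# uploaded bytes to a temp file and stash its path in session_state so it
# "can be fetched later", as the docstring says. Details are assumptions.
import tempfile
import streamlit as st

uploaded_file = st.sidebar.file_uploader('Upload the File')
if uploaded_file is not None:
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
        temp.write(uploaded_file.getbuffer())
    st.session_state['filepath'] = temp.name
```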