Upload 4 files
- appStore/info.py +72 -0
- appStore/keyword_search.py +176 -0
- appStore/multiapp.py +70 -0
- appStore/sdg_analysis.py +179 -0
appStore/info.py
ADDED
@@ -0,0 +1,72 @@
import streamlit as st


def app():

    with open('style.css') as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    st.markdown("<h2 style='text-align: center; \
                 color: black;'> Policy Action Tracker</h2>",
                unsafe_allow_html=True)

    st.markdown("<div style='text-align: center; \
                 color: grey;'>The Policy Action Tracker is an open-source \
                 digital tool which aims to assist policy analysts and \
                 other users in extracting and filtering relevant \
                 information from policy documents.</div>",
                unsafe_allow_html=True)

    footer = """
        <div class="footer-custom">
            Guidance & Feedback - <a href="https://www.linkedin.com/in/maren-bernlöhr-149891222" target="_blank">Maren Bernlöhr</a> |
            <a href="https://www.linkedin.com/in/manuelkuhm" target="_blank">Manuel Kuhm</a> |
            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
            <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
        </div>
    """
    st.markdown(footer, unsafe_allow_html=True)

    c1, c2, c3 = st.columns([8, 1, 12])
    with c1:
        st.image("docStore/img/ndc.png")
    with c3:
        st.markdown('<div style="text-align: justify;">The manual extraction \
            of relevant information from text documents is a \
            time-consuming task for any policy analyst. As the amount and length of \
            public policy documents in relation to sustainable development (such as \
            National Development Plans and Nationally Determined Contributions) \
            continuously increases, a major challenge for policy action tracking – the \
            evaluation of stated goals and targets and their actual implementation on \
            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
            Language Processing (NLP) methods can help in shortening and easing this \
            task for policy analysts.</div><br>',
            unsafe_allow_html=True)

    intro = """
    <div style="text-align: justify;">

    For this purpose, the United Nations Sustainable Development Solutions \
    Network (SDSN) and the Deutsche Gesellschaft für Internationale \
    Zusammenarbeit (GIZ) GmbH have collaborated in the development \
    of this AI-powered open-source web application that helps find and extract \
    relevant information from public policy documents faster to facilitate \
    evidence-based decision-making processes in sustainable development and beyond.

    This tool allows policy analysts and other users the possibility to rapidly \
    search for relevant information/paragraphs in the document according to the \
    user’s interest, classify the document’s content according to the Sustainable \
    Development Goals (SDGs), and compare climate-related policy documents and NDCs \
    across countries using open data from the German Institute of Development and \
    Sustainability’s (IDOS) NDC Explorer.
    To understand the application's functionalities and learn more about \
    the project, see the attached concept note. We hope you like our application 😊

    </div>
    <br>
    """
    st.markdown(intro, unsafe_allow_html=True)
    # st.image("docStore/img/paris.png")
appStore/keyword_search.py
ADDED
@@ -0,0 +1,176 @@
# set path
import glob, os, sys
sys.path.append('../utils')

import streamlit as st
import json
import logging
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect

# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                                      'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
# the reader reuses the retriever's top-k setting
reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))


def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *Search* app is an interface \
            for doing contextual and keyword searches in \
            policy documents. \
            """)
        st.write("")
        st.write(""" The application allows its user to perform a search \
            based on two options: a lexical search ([TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) \
            and a semantic search ([bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)). \
            The lexical search only \
            displays paragraphs in the document with exact matching results, \
            while the semantic search shows paragraphs with meaningful connections \
            (e.g., synonyms) based on the search context. Both \
            methods employ a probabilistic retrieval framework in their identification \
            of relevant paragraphs. By default the search is performed using \
            'Semantic Search'; to find 'Exact/Lexical Matches', please tick the \
            checkbox provided, which will bypass the semantic search. Furthermore, \
            the application allows the user to search for pre-defined keywords \
            from different thematic buckets present in the sidebar.""")
        st.write("")
        st.write(""" The Exact Matches search returns the top {} findings, and the Semantic
            search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3 = st.columns([2, 4, 4])
        with col1:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")

        with col2:
            st.caption("Lexical Search on 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("15 sec")

        with col3:
            st.caption("Semantic search on 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec (including embedding creation)")

    with st.sidebar:
        with open('docStore/sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        # genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        st.caption("Select Keyword Category")
        genre = checkbox_without_preselect(list(keywordexample.keys()))
        if genre:
            keywordList = keywordexample[genre]
        else:
            keywordList = None

        st.markdown("---")

    with st.container():
        type_hinting = "Please enter here your question and we \
                        will look for an answer in the document \
                        OR enter the keyword you are looking \
                        for and we will look for similar \
                        context in the document. \
                        You can also explore predefined sets of keywords from the sidebar."
        if keywordList is not None:
            # queryList = st.text_input("You selected the {} category we \
            #             will look for these keywords in document".format(genre),
            #             value="{}".format(keywordList))
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you don't have any, \
                        please try example sets from the sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'filepath' in st.session_state:

                    if searchtype:
                        all_documents = runLexicalPreprocessingPipeline(
                            file_name=st.session_state['filename'],
                            file_path=st.session_state['filepath'],
                            split_by=lexical_split_by,
                            split_length=lexical_split_length,
                            split_overlap=lexical_split_overlap,
                            remove_punc=lexical_remove_punc)
                        logging.info("performing lexical search")
                        with st.spinner("Performing Exact matching search \
                                        (Lexical search) for you"):
                            lexical_search(query=queryList,
                                           documents=all_documents['documents'],
                                           top_k=lexical_top_k)
                    else:
                        all_documents = runSemanticPreprocessingPipeline(
                            file_path=st.session_state['filepath'],
                            file_name=st.session_state['filename'],
                            split_by=split_by,
                            split_length=split_length,
                            split_overlap=split_overlap,
                            remove_punc=remove_punc,
                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                        if len(all_documents['documents']) > 100:
                            warning_msg = ": This might take some time, please sit back and relax."
                        else:
                            warning_msg = ""

                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                            semantic_keywordsearch(query=queryList,
                                                   documents=all_documents['documents'],
                                                   embedding_model=embedding_model,
                                                   embedding_layer=embedding_layer,
                                                   embedding_model_format=embedding_model_format,
                                                   reader_model=reader_model, reader_top_k=reader_top_k,
                                                   retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
                                                   max_seq_len=max_seq_len,
                                                   top_k_per_candidate=top_k_per_candidate)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")
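keyword_search.py expects a paramconfig.cfg with [semantic_search] and [lexical_search] sections containing the keys read at the top of the file. The snippet below is a minimal sketch that generates such a file with configparser; the section and key names mirror the config.get() calls above, while every value (split sizes, top-k counts, model names) is an illustrative assumption rather than the project's actual configuration.

# Minimal sketch: generate a paramconfig.cfg with the sections/keys that
# keyword_search.py reads. All values below are illustrative assumptions.
import configparser

config = configparser.ConfigParser()
config['semantic_search'] = {
    'SPLIT_BY': 'word',                  # assumption
    'SPLIT_LENGTH': '120',               # assumption
    'SPLIT_OVERLAP': '10',               # assumption
    'RESPECT_SENTENCE_BOUNDARY': '1',    # 1 -> True
    'REMOVE_PUNC': '0',                  # 0 -> False
    'RETRIEVER': 'sentence-transformers/all-MiniLM-L6-v2',   # assumed model
    'RETRIEVER_FORMAT': 'sentence_transformers',             # assumption
    'RETRIEVER_EMB_LAYER': '-1',
    'EMBEDDING_DIM': '384',
    'MAX_SEQ_LENGTH': '512',
    'RETRIEVER_TOP_K': '10',
    'READER': 'deepset/tinyroberta-squad2',                   # assumed model
    'READER_TOP_K_PER_CANDIDATE': '1',
}
config['lexical_search'] = {
    'SPLIT_BY': 'word',
    'SPLIT_LENGTH': '120',
    'SPLIT_OVERLAP': '0',
    'REMOVE_PUNC': '1',
    'TOP_K': '10',
}

with open('paramconfig.cfg', 'w') as f:
    config.write(f)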
appStore/multiapp.py
ADDED
@@ -0,0 +1,70 @@
"""Framework for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image
from streamlit_option_menu import option_menu
from utils.uploadAndExample import add_upload


class MultiApp:
    """Framework for combining multiple streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", "house", foo)
        app.add_app("Bar", "gear", bar)
        app.run()
    It is also possible to keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", "house", foo.app)
        app.add_app("Bar", "gear", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self, title, icon, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        title:
            title of the app. Appears in the sidebar menu.
        icon:
            icon of the app. Appears next to the title in the sidebar menu.
        """
        self.apps.append({
            "title": title,
            "icon": icon,
            "function": func
        })

    def run(self):

        # note: this call appears to be a leftover from an earlier selectbox-based
        # navigation and renders nothing
        st.sidebar.write(format_func=lambda app: app['title'])
        image = Image.open('docStore/img/sdsn.png')
        st.sidebar.image(image, width=200)

        with st.sidebar:
            selected = option_menu(None, [page["title"] for page in self.apps],
                                   icons=[page["icon"] for page in self.apps],
                                   menu_icon="cast", default_index=0)
            st.markdown("---")

        for index, item in enumerate(self.apps):
            if item["title"] == selected:
                self.apps[index]["function"]()
                break

        choice = st.sidebar.radio(label='Select the Document',
                                  help='You can upload the document \
                                  or else you can try an example document',
                                  options=('Upload Document', 'Try Example'),
                                  horizontal=True)
        add_upload(choice)
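This commit adds the page modules and the MultiApp container but not the entry point itself. A top-level script along the following lines would tie the pieces together; the file name app.py and the bootstrap-icon identifiers are assumptions for illustration, only add_app(title, icon, func) and run() come from multiapp.py above.

# Hypothetical entry point (e.g. app.py) wiring the pages added in this commit.
# Icon names are streamlit_option_menu bootstrap-icon identifiers chosen for
# illustration, not taken from the repository.
import appStore.info as info
import appStore.keyword_search as keyword_search
import appStore.sdg_analysis as sdg_analysis
from appStore.multiapp import MultiApp

app = MultiApp()
app.add_app("About", "house", info.app)
app.add_app("Search", "search", keyword_search.app)
app.add_app("SDG Analysis", "bar-chart", sdg_analysis.app)
app.run()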
appStore/sdg_analysis.py
ADDED
@@ -0,0 +1,179 @@
# set path
import glob, os, sys
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from st_aggrid import AgGrid
from st_aggrid.shared import ColumnsAutoSizeMode
from utils.sdg_classifier import sdg_classification
from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.checkconfig import getconfig


# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
model_name = config.get('sdg', 'MODEL')
split_by = config.get('sdg', 'SPLIT_BY')
split_length = int(config.get('sdg', 'SPLIT_LENGTH'))
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))
remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))
split_respect_sentence_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))
threshold = float(config.get('sdg', 'THRESHOLD'))
top_n = int(config.get('sdg', 'TOP_KEY'))


def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *SDG Analysis* app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents with respect to SDG \
            classification of the paragraphs/texts in the document and \
            extracting keyphrases per SDG label - developed by GIZ Data \
            and the Sustainable Development Solutions Network. \n
            """)
        st.write("""**Document Processing:** The uploaded/selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 120 words using a Haystack preprocessing pipeline. The \
            length of 120 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect sentence boundaries, this limit \
            can be breached and hence the limit of 120 is tentative. \n
            """)
        st.write("""**SDG Classification:** The application assigns paragraphs \
            to 16 of the 17 United Nations Sustainable Development Goals (SDGs). \
            SDG 17 “Partnerships for the Goals” is excluded from the analysis due \
            to its broad nature, which could potentially inflate the results. \
            Each paragraph is assigned to one SDG only. The results are \
            displayed in a summary table including the number of the SDG, a \
            relevancy score highlighted through a green color shading, and the \
            respective text of the analyzed paragraph. Additionally, a pie \
            chart with a blue color shading is displayed which illustrates the \
            three most prominent SDGs in the document. The SDG classification \
            uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
            from [OSDG.ai](https://osdg.ai/), which is a global \
            partnership and growing community of researchers and institutions \
            interested in the classification of research according to the \
            Sustainable Development Goals. The summary table only displays \
            paragraphs with a calculated relevancy score above 85%. \n""")

        st.write("""**Keyphrase Extraction:** The application extracts 15 \
            keyphrases from the document for each SDG label and displays the \
            results in a summary table. The keyphrases are extracted \
            using [Textrank](https://github.com/summanlp/textrank), \
            which is an easy-to-use, computationally less expensive \
            model leveraging a combination of TF-IDF and graph networks.
            """)
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3, col4 = st.columns([2, 2, 4, 4])
        with col1:
            st.caption("Loading Time Classifier")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("12 sec")
        with col2:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")
        with col3:
            st.caption("SDG Classification of 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec")
        with col4:
            st.caption("Keyword extraction for 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">3 sec</div>', unsafe_allow_html=True)
            st.write("3 sec")


    ### Main app code ###
    with st.container():
        if st.button("RUN SDG Analysis"):

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']
                classifier = load_sdgClassifier(classifier_name=model_name)
                st.session_state['sdg_classifier'] = classifier
                all_documents = runSDGPreprocessingPipeline(file_name=file_name,
                                    file_path=file_path, split_by=split_by,
                                    split_length=split_length,
                                    split_respect_sentence_boundary=split_respect_sentence_boundary,
                                    split_overlap=split_overlap, remove_punc=remove_punc)

                if len(all_documents['documents']) > 100:
                    warning_msg = ": This might take some time, please sit back and relax."
                else:
                    warning_msg = ""

                with st.spinner("Running SDG Classification{}".format(warning_msg)):

                    df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                               threshold=threshold)
                    df = df.drop(['Relevancy'], axis=1)
                    sdg_labels = x.SDG.unique()
                    textrank_keyword_list = []
                    for label in sdg_labels:
                        sdgdata = " ".join(df[df.SDG == label].text.to_list())
                        textranklist_ = textrank(textdata=sdgdata, words=top_n)
                        if len(textranklist_) > 0:
                            textrank_keyword_list.append({'SDG': label, 'TextRank Keywords': ",".join(textranklist_)})
                    textrank_keywords_df = pd.DataFrame(textrank_keyword_list)

                    plt.rcParams['font.size'] = 25
                    colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
                    # plot
                    fig, ax = plt.subplots()
                    ax.pie(x['count'], colors=colors, radius=2, center=(4, 4),
                           wedgeprops={"linewidth": 1, "edgecolor": "white"},
                           textprops={'fontsize': 14},
                           frame=False, labels=list(x.SDG_Num),
                           labeldistance=1.2)
                    # fig.savefig('temp.png', bbox_inches='tight', dpi=100)

                    st.markdown("#### Anything related to SDGs? ####")

                    c4, c5, c6 = st.columns([1, 2, 2])

                    with c5:
                        st.pyplot(fig)
                    with c6:
                        labeldf = x['SDG_name'].values.tolist()
                        labeldf = "<br>".join(labeldf)
                        st.markdown(labeldf, unsafe_allow_html=True)
                    st.write("")
                    st.markdown("###### What keywords are present under SDG classified text? ######")

                    AgGrid(textrank_keywords_df, reload_data=False,
                           update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
                    st.write("")
                    st.markdown("###### Top few SDG Classified paragraph/text results ######")

                    AgGrid(df, reload_data=False, update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")
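sdg_analysis.py reads its own [sdg] section from the same paramconfig.cfg. A hedged sketch of that section follows; only the key names come from the config.get() calls above, while the classifier name is a placeholder and the values are assumptions (THRESHOLD and TOP_KEY are guessed from the 85% relevancy cut-off and the 15 keyphrases mentioned in the app text).

# Sketch of the [sdg] section expected by sdg_analysis.py; all values are assumptions.
import configparser

config = configparser.ConfigParser()
config['sdg'] = {
    'MODEL': 'your-org/sdg-classifier',   # placeholder classifier checkpoint
    'SPLIT_BY': 'word',
    'SPLIT_LENGTH': '120',
    'SPLIT_OVERLAP': '10',
    'REMOVE_PUNC': '0',
    'RESPECT_SENTENCE_BOUNDARY': '1',
    'THRESHOLD': '0.85',                  # assumed to match the 85% relevancy cut-off
    'TOP_KEY': '15',                      # assumed to match the 15 keyphrases per SDG
}

with open('paramconfig.cfg', 'a') as f:   # append alongside the search sections
    config.write(f)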