prashant committed
Commit: 550b85d
Parent(s): 4e2e62f

changing list order adding coherence
Files changed:
- app.py +2 -1
- appStore/coherence.py +3 -262
- appStore/keyword_search.py +9 -12
- docStore/sample/files.json +3 -2
- docStore/sample/keywordexample.json +3 -3
- paramconfig.cfg +1 -1
- ver0.1 scripts/coherence.py +267 -0
app.py
CHANGED
@@ -1,6 +1,6 @@
 import appStore.keyword_search as keyword_search
 import appStore.sdg_analysis as sdg_analysis
-
+import appStore.coherence as coherence
 import appStore.info as info
 from appStore.multiapp import MultiApp
 import streamlit as st
@@ -13,5 +13,6 @@ app = MultiApp()
 app.add_app("About","house", info.app)
 app.add_app("SDG Analysis","gear",sdg_analysis.app)
 app.add_app("Search","search", keyword_search.app)
+app.add_app("NDC Coherence","exclude", coherence.app)
 
 app.run()
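appStore/multiapp.py itself is not touched by this commit. For orientation only, a minimal sketch of a MultiApp-style dispatcher is given below; the class body, the sidebar radio menu and the dict layout are assumptions for illustration, not the real implementation.

# Hypothetical sketch of a MultiApp-style dispatcher; the real appStore/multiapp.py is not shown in this commit.
import streamlit as st

class MultiApp:
    def __init__(self):
        self.apps = []  # (title, icon, render callable) in registration order

    def add_app(self, title, icon, func):
        # Registration order is display order, which is why "NDC Coherence"
        # is appended after "Search" in app.py above.
        self.apps.append({"title": title, "icon": icon, "func": func})

    def run(self):
        titles = [a["title"] for a in self.apps]
        choice = st.sidebar.radio("Go to", titles)  # stand-in for the real menu widget
        next(a for a in self.apps if a["title"] == choice)["func"]()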
appStore/coherence.py
CHANGED
@@ -1,267 +1,8 @@
 # set path
-import glob, os, sys;
-
-#import helper
-import udfPreprocess.docPreprocessing as pre
-import udfPreprocess.cleaning as clean
-
-#import needed libraries
-import seaborn as sns
-from pandas import DataFrame
-from sentence_transformers import SentenceTransformer, CrossEncoder, util
-from sklearn.metrics.pairwise import cosine_similarity
-# from keybert import KeyBERT
-from transformers import pipeline
-import matplotlib.pyplot as plt
-import numpy as np
+import glob, os, sys;
+sys.path.append('../utils')
+
 import streamlit as st
-import pandas as pd
-from rank_bm25 import BM25Okapi
-from sklearn.feature_extraction import _stop_words
-import string
-from tqdm.autonotebook import tqdm
-import numpy as np
-import urllib.request
-import ast
-import tempfile
-import sqlite3
-import json
-import urllib.request
-import ast
-import docx
-from docx.shared import Inches
-from docx.shared import Pt
-from docx.enum.style import WD_STYLE_TYPE
 
 def app():
-
-    st.sidebar.title('Check Coherence')
-    st.sidebar.write(' ')
-    with open('ndcs/countryList.txt') as dfile:
-        countryList = dfile.read()
-
-    countryList = ast.literal_eval(countryList)
-    countrynames = list(countryList.keys())
-
-    option = st.sidebar.selectbox('Select Country', (countrynames))
-    countryCode = countryList[option]
-
-
-    with st.container():
-        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
-        st.write(' ')
-        st.write(' ')
-
-    with st.expander("ℹ️ - About this app", expanded=True):
-
-        st.write(
-            """
-            The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
-            """
-        )
-
-        st.markdown("")
-
-    st.markdown("")
-    st.markdown("## 📌 Step One: Upload document of the country selected ")
-
-    with st.container():
-        docs = None
-        # asking user for either upload or select existing doc
-        choice = st.radio(label = 'Select the Document',
-                          help = 'You can upload the document \
-                          or else you can try a example document.',
-                          options = ('Upload Document', 'Try Example'),
-                          horizontal = True)
-
-        if choice == 'Upload Document':
-            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
-            if uploaded_file is not None:
-                with tempfile.NamedTemporaryFile(mode="wb") as temp:
-                    bytes_data = uploaded_file.getvalue()
-                    temp.write(bytes_data)
-
-                    st.write("Uploaded Filename: ", uploaded_file.name)
-                    file_name = uploaded_file.name
-                    file_path = temp.name
-                    docs = pre.load_document(file_path, file_name)
-                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-        else:
-            # listing the options
-            option = st.selectbox('Select the example document',
-                                  ('South Africa:Low Emission strategy',
-                                   'Ethiopia: 10 Year Development Plan'))
-            if option is 'South Africa:Low Emission strategy':
-                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
-                countryCode = countryList['South Africa']
-                st.write("Selected document:", file_name.split('/')[1])
-                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
-                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
-            else:
-                # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
-                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
-                countryCode = countryList['Ethiopia']
-                st.write("Selected document:", file_name.split('/')[1])
-
-            if option is not None:
-                docs = pre.load_document(file_path,file_name)
-                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
-
-    with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
-        cca_sent = dfile.read()
-
-    cca_sent = ast.literal_eval(cca_sent)
-
-    with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
-        ccm_sent = dfile.read()
-
-    ccm_sent = ast.literal_eval(ccm_sent)
-
-    with open('ndcs/countryList.txt') as dfile:
-        countryList = dfile.read()
-
-    countryList = ast.literal_eval(countryList)
-
-    def get_document(countryCode: str):
-        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
-        with urllib.request.urlopen(link) as urlfile:
-            data = json.loads(urlfile.read())
-        categoriesData = {}
-        categoriesData['categories']= data['categories']
-        categoriesData['subcategories']= data['subcategories']
-        keys_sub = categoriesData['subcategories'].keys()
-        documentType= 'NDCs'
-        if documentType in data.keys():
-            if countryCode in data[documentType].keys():
-                get_dict = {}
-                for key, value in data[documentType][countryCode].items():
-                    if key not in ['country_name','region_id', 'region_name']:
-                        get_dict[key] = value['classification']
-                    else:
-                        get_dict[key] = value
-            else:
-                return None
-        else:
-            return None
-
-        country = {}
-        for key in categoriesData['categories']:
-            country[key]= {}
-        for key,value in categoriesData['subcategories'].items():
-            country[value['category']][key] = get_dict[key]
-
-        return country
-
-    # country_ndc = get_document('NDCs', countryList[option])
-
-    def countrySpecificCCA(cca_sent, threshold, countryCode):
-        temp = {}
-        doc = get_document(countryCode)
-        for key,value in cca_sent.items():
-            id_ = doc['climate change adaptation'][key]['id']
-            if id_ >threshold:
-                temp[key] = value['id'][id_]
-        return temp
-
-
-    def countrySpecificCCM(ccm_sent, threshold, countryCode):
-        temp = {}
-        doc = get_document(countryCode)
-        for key,value in ccm_sent.items():
-            id_ = doc['climate change mitigation'][key]['id']
-            if id_ >threshold:
-                temp[key] = value['id'][id_]
-
-        return temp
-
-
-
-    if docs is not None:
-        sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
-        sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
-        #st.write(sent_ccm)
-        @st.cache(allow_output_mutation=True)
-        def load_sentenceTransformer(name):
-            return SentenceTransformer(name)
-        model = load_sentenceTransformer('all-MiniLM-L6-v2')
-
-        document_embeddings = model.encode(paraList, show_progress_bar=True)
-
-        genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
-        if genre == 'Climate Change Adaptation':
-            sent_dict = sent_cca
-            sent_labels = []
-            for key,sent in sent_dict.items():
-                sent_labels.append(sent)
-            label_embeddings = model.encode(sent_labels, show_progress_bar=True)
-            similarity_high_threshold = 0.55
-            similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
-            label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
-
-            positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
-
-
-        else:
-            sent_dict = sent_ccm
-            sent_labels = []
-            for key,sent in sent_dict.items():
-                sent_labels.append(sent)
-            label_embeddings = model.encode(sent_labels, show_progress_bar=True)
-            similarity_high_threshold = 0.55
-            similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
-            label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
-
-            positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
-
-
-        # sent_labels = []
-        # for key,sent in sent_dict.items():
-        #     sent_labels.append(sent)
-
-
-        # label_embeddings = model.encode(sent_labels, show_progress_bar=True)
-
-        #similarity_high_threshold = 0.55
-        # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
-        #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
-
-        #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
-        document = docx.Document()
-        document.add_heading('Document name:{}'.format(file_name), 2)
-        section = document.sections[0]
-
-        # Calling the footer
-        footer = section.footer
-
-        # Calling the paragraph already present in
-        # the footer section
-        footer_para = footer.paragraphs[0]
-
-        font_styles = document.styles
-        font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
-        font_object = font_charstyle.font
-        font_object.size = Pt(7)
-        # Adding the centered zoned footer
-        footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
-
-        document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))
-
-        for _label_idx, _paragraph_idx in positive_indices:
-            st.write("This paragraph: \n")
-            document.add_paragraph("This paragraph: \n")
-            st.write(paraList[_paragraph_idx])
-            st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
-            document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
-            st.write('-'*10)
-            document.add_paragraph('-'*10)
-
-        document.save('demo.docx')
-        with open("demo.docx", "rb") as file:
-            btn = st.download_button(
-                label="Download file",
-                data=file,
-                file_name="demo.docx",
-                mime="txt/docx"
-            )
-
+    st.write("Coming soon")
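The page body removed above (and preserved under ver0.1 scripts/ at the end of this commit) matches NDC statements against document paragraphs by embedding both with a SentenceTransformer and keeping every pair whose cosine similarity exceeds 0.55. A minimal, self-contained sketch of that matching step, using toy labels and paragraphs in place of the app's ndcs/ data files:

# Minimal sketch of the label-vs-paragraph matching used by the removed coherence code.
# Toy inputs only; the real app reads its labels from ndcs/cca.txt and ndcs/ccm.txt.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

labels = ["Expand renewable electricity generation",
          "Strengthen flood protection infrastructure"]
paragraphs = ["The plan commits to new solar and wind capacity by 2030.",
              "Coastal dikes will be reinforced to cope with sea level rise."]

label_emb = model.encode(labels)
para_emb = model.encode(paragraphs)

similarity = cosine_similarity(label_emb, para_emb)   # shape: (labels, paragraphs)
label_idx, para_idx = np.where(similarity > 0.55)     # same 0.55 threshold as the app

for i, j in zip(label_idx.tolist(), para_idx.tolist()):
    print(f"'{paragraphs[j]}' is relevant to '{labels[i]}'")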
appStore/keyword_search.py
CHANGED
@@ -56,10 +56,11 @@ def app():
             on the context as well. The semantic search allows for a personalized\
             experience in using the application. Both methods employ a \
             probabilistic retrieval framework in its identification of relevant \
-            paragraphs. By defualt the search is
-
+            paragraphs. By defualt the search is performed using 'Semantic Search'
+            to find 'Exact/Lexical Matches' please tick the checkbox provided, which will \
             by pass semantic search.. Furthermore, the application allows the \
-            user to search for pre-defined keywords from different thematic buckets
+            user to search for pre-defined keywords from different thematic buckets\
+            present in sidebar.""")
 
 
     with st.sidebar:
@@ -72,11 +73,6 @@ def app():
         else:
            keywordList = None
 
-        # searchtype = st.selectbox("Do you want to find exact macthes or similar \
-        #                           meaning/context",
-        #                           ['Exact Matches', 'Similar context/meaning'])
-
-
        st.markdown("---")
 
    with st.container():
@@ -84,7 +80,6 @@ def app():
        # queryList = st.text_input("You selected the {} category we \
        #                           will look for these keywords in document".format(genre),
        #                           value="{}".format(keywordList))
-        # else:
        queryList = st.text_input("Please enter here your question and we \
                                   will look for an answer in the document\
                                   OR enter the keyword you are looking \
@@ -92,7 +87,6 @@ def app():
                                   context in the document. You can select the \
                                   presets of keywords from sidebar.",
                                   value = "{}".format(keywordList))
-        #                          placeholder="Enter keyword here")
        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):
 
@@ -129,10 +123,13 @@ def app():
                                    split_overlap=split_overlap,
                                    removePunc= remove_punc,
                                    split_respect_sentence_boundary=split_respect_sentence_boundary)
-
+            if len(allDocuments['documents']) > 100:
+                warning_msg = ": This might take sometime, please sit back and relax."
+            else:
+                warning_msg = ""
 
            logging.info("starting semantic search")
-            with st.spinner("Performing Similar/Contextual search"):
+            with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                semantic_search(query = queryList,
                                documents = allDocuments['documents'],
                                embedding_model=embedding_model,
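The hunks above show only fragments of app(). Purely as an illustration of how the 'Show only Exact Matches' checkbox and the new conditional spinner message fit together, here is a condensed, hypothetical sketch; lexical_search and semantic_search are stand-ins for the module's real helpers, and the toy inputs replace the preprocessed document.

# Condensed, hypothetical sketch of the search flow around the edited lines.
import streamlit as st

def lexical_search(query, documents):            # stand-in for the app's real helper
    st.write("(exact-match search for: {})".format(query))

def semantic_search(query, documents, embedding_model):   # stand-in for the app's real helper
    st.write("(semantic search for: {})".format(query))

queryList = st.text_input("Enter your question or keywords", value="Climate,Adaptation")
allDocuments = {'documents': ["paragraph one", "paragraph two"]}   # toy stand-in
embedding_model = None                                             # toy stand-in

searchtype = st.checkbox("Show only Exact Matches")
if st.button("Find them"):
    if searchtype:
        lexical_search(queryList, allDocuments['documents'])
    else:
        # The new warning suffix only appears for large documents.
        if len(allDocuments['documents']) > 100:
            warning_msg = ": This might take sometime, please sit back and relax."
        else:
            warning_msg = ""
        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
            semantic_search(query=queryList, documents=allDocuments['documents'],
                            embedding_model=embedding_model)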
docStore/sample/files.json
CHANGED
@@ -1,2 +1,3 @@
-{"
-
+{"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
+"South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.txt"
+}
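files.json now maps an example-document title to its path under docStore/sample/. A small sketch of how such a mapping can drive a 'Try Example' picker follows; the exact wiring inside keyword_search.py is not part of this hunk, so treat it as an assumption.

# Sketch: populate an example-document selectbox from docStore/sample/files.json.
import json
import streamlit as st

with open('docStore/sample/files.json') as f:
    examples = json.load(f)          # {"title": "path/to/file.txt", ...}

title = st.selectbox('Select the example document', list(examples.keys()))
file_path = examples[title]
st.write("Selected document:", file_path.split('/')[-1])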
docStore/sample/keywordexample.json
CHANGED
@@ -1,7 +1,7 @@
 {
-"Food":"Food security,Nutrition,Diets,Food loss",
 "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
-"
+"Food":"Food security,Nutrition,Diets,Food loss",
+"Implementation":"Implementation,transformation,reform,integration,strategy,policy",
 "Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
-"
+"Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority"
 }
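keywordexample.json now lists the thematic buckets in alphabetical order and adds 'Implementation' and 'Social' presets. A sketch of how these presets could pre-fill the search box from the sidebar (again an assumption about the surrounding code, which this hunk does not show):

# Sketch: offer keyword presets from docStore/sample/keywordexample.json in the sidebar.
import json
import streamlit as st

with open('docStore/sample/keywordexample.json') as f:
    keywordexample = json.load(f)    # {"Climate": "Climate,Adaptation,...", ...}

genre = st.sidebar.selectbox("Select a thematic bucket", list(keywordexample.keys()))
keywordList = keywordexample[genre]

queryList = st.text_input("Enter your question or keywords",
                          value="{}".format(keywordList))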
paramconfig.cfg
CHANGED
@@ -12,7 +12,7 @@ RETRIEVER = msmarco-bert-base-dot-v5
 RETRIEVER_FORMAT = sentence_transformers
 RETRIEVER_EMB_LAYER = -1
 READER = deepset/tinyroberta-squad2
-READER_TOP_K =
+READER_TOP_K = 10
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
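READER_TOP_K now carries an explicit value of 10. A minimal sketch of reading these keys with configparser; the section lookup is an assumption, since paramconfig.cfg's section headers are not visible in this hunk.

# Sketch: read reader settings from paramconfig.cfg with configparser.
# Picking the first section is an assumption; only the keys shown in the diff are real.
import configparser

config = configparser.ConfigParser()
config.read('paramconfig.cfg')

section = config.sections()[0]                          # whichever section holds these keys
reader_model = config.get(section, 'READER')            # deepset/tinyroberta-squad2
reader_top_k = config.getint(section, 'READER_TOP_K')   # 10 after this commit
print(reader_model, reader_top_k)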
ver0.1 scripts/coherence.py
ADDED
@@ -0,0 +1,267 @@
+# set path
+import glob, os, sys; sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+from sklearn.metrics.pairwise import cosine_similarity
+# from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import urllib.request
+import ast
+import tempfile
+import sqlite3
+import json
+import urllib.request
+import ast
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+
+def app():
+    # Sidebar
+    st.sidebar.title('Check Coherence')
+    st.sidebar.write(' ')
+    with open('ndcs/countryList.txt') as dfile:
+        countryList = dfile.read()
+
+    countryList = ast.literal_eval(countryList)
+    countrynames = list(countryList.keys())
+
+    option = st.sidebar.selectbox('Select Country', (countrynames))
+    countryCode = countryList[option]
+
+
+    with st.container():
+        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=True):
+
+        st.write(
+            """
+            The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
+            """
+        )
+
+        st.markdown("")
+
+    st.markdown("")
+    st.markdown("## 📌 Step One: Upload document of the country selected ")
+
+    with st.container():
+        docs = None
+        # asking user for either upload or select existing doc
+        choice = st.radio(label = 'Select the Document',
+                          help = 'You can upload the document \
+                          or else you can try a example document.',
+                          options = ('Upload Document', 'Try Example'),
+                          horizontal = True)
+
+        if choice == 'Upload Document':
+            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+            if uploaded_file is not None:
+                with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                    bytes_data = uploaded_file.getvalue()
+                    temp.write(bytes_data)
+
+                    st.write("Uploaded Filename: ", uploaded_file.name)
+                    file_name = uploaded_file.name
+                    file_path = temp.name
+                    docs = pre.load_document(file_path, file_name)
+                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+        else:
+            # listing the options
+            option = st.selectbox('Select the example document',
+                                  ('South Africa:Low Emission strategy',
+                                   'Ethiopia: 10 Year Development Plan'))
+            if option is 'South Africa:Low Emission strategy':
+                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+                countryCode = countryList['South Africa']
+                st.write("Selected document:", file_name.split('/')[1])
+                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
+                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
+            else:
+                # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
+                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+                countryCode = countryList['Ethiopia']
+                st.write("Selected document:", file_name.split('/')[1])
+
+            if option is not None:
+                docs = pre.load_document(file_path,file_name)
+                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+
+    with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
+        cca_sent = dfile.read()
+
+    cca_sent = ast.literal_eval(cca_sent)
+
+    with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
+        ccm_sent = dfile.read()
+
+    ccm_sent = ast.literal_eval(ccm_sent)
+
+    with open('ndcs/countryList.txt') as dfile:
+        countryList = dfile.read()
+
+    countryList = ast.literal_eval(countryList)
+
+    def get_document(countryCode: str):
+        link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
+        with urllib.request.urlopen(link) as urlfile:
+            data = json.loads(urlfile.read())
+        categoriesData = {}
+        categoriesData['categories']= data['categories']
+        categoriesData['subcategories']= data['subcategories']
+        keys_sub = categoriesData['subcategories'].keys()
+        documentType= 'NDCs'
+        if documentType in data.keys():
+            if countryCode in data[documentType].keys():
+                get_dict = {}
+                for key, value in data[documentType][countryCode].items():
+                    if key not in ['country_name','region_id', 'region_name']:
+                        get_dict[key] = value['classification']
+                    else:
+                        get_dict[key] = value
+            else:
+                return None
+        else:
+            return None
+
+        country = {}
+        for key in categoriesData['categories']:
+            country[key]= {}
+        for key,value in categoriesData['subcategories'].items():
+            country[value['category']][key] = get_dict[key]
+
+        return country
+
+    # country_ndc = get_document('NDCs', countryList[option])
+
+    def countrySpecificCCA(cca_sent, threshold, countryCode):
+        temp = {}
+        doc = get_document(countryCode)
+        for key,value in cca_sent.items():
+            id_ = doc['climate change adaptation'][key]['id']
+            if id_ >threshold:
+                temp[key] = value['id'][id_]
+        return temp
+
+
+    def countrySpecificCCM(ccm_sent, threshold, countryCode):
+        temp = {}
+        doc = get_document(countryCode)
+        for key,value in ccm_sent.items():
+            id_ = doc['climate change mitigation'][key]['id']
+            if id_ >threshold:
+                temp[key] = value['id'][id_]
+
+        return temp
+
+
+
+    if docs is not None:
+        sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
+        sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
+        #st.write(sent_ccm)
+        @st.cache(allow_output_mutation=True)
+        def load_sentenceTransformer(name):
+            return SentenceTransformer(name)
+        model = load_sentenceTransformer('all-MiniLM-L6-v2')
+
+        document_embeddings = model.encode(paraList, show_progress_bar=True)
+
+        genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
+        if genre == 'Climate Change Adaptation':
+            sent_dict = sent_cca
+            sent_labels = []
+            for key,sent in sent_dict.items():
+                sent_labels.append(sent)
+            label_embeddings = model.encode(sent_labels, show_progress_bar=True)
+            similarity_high_threshold = 0.55
+            similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
+            label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
+
+            positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
+
+
+        else:
+            sent_dict = sent_ccm
+            sent_labels = []
+            for key,sent in sent_dict.items():
+                sent_labels.append(sent)
+            label_embeddings = model.encode(sent_labels, show_progress_bar=True)
+            similarity_high_threshold = 0.55
+            similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
+            label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
+
+            positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
+
+
+        # sent_labels = []
+        # for key,sent in sent_dict.items():
+        #     sent_labels.append(sent)
+
+
+        # label_embeddings = model.encode(sent_labels, show_progress_bar=True)
+
+        #similarity_high_threshold = 0.55
+        # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
+        #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
+
+        #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
+        document = docx.Document()
+        document.add_heading('Document name:{}'.format(file_name), 2)
+        section = document.sections[0]
+
+        # Calling the footer
+        footer = section.footer
+
+        # Calling the paragraph already present in
+        # the footer section
+        footer_para = footer.paragraphs[0]
+
+        font_styles = document.styles
+        font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
+        font_object = font_charstyle.font
+        font_object.size = Pt(7)
+        # Adding the centered zoned footer
+        footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
+
+        document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))
+
+        for _label_idx, _paragraph_idx in positive_indices:
+            st.write("This paragraph: \n")
+            document.add_paragraph("This paragraph: \n")
+            st.write(paraList[_paragraph_idx])
+            st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
+            document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
+            st.write('-'*10)
+            document.add_paragraph('-'*10)
+
+        document.save('demo.docx')
+        with open("demo.docx", "rb") as file:
+            btn = st.download_button(
+                label="Download file",
+                data=file,
+                file_name="demo.docx",
+                mime="txt/docx"
+            )
+