prashant committed on
Commit
550b85d
1 Parent(s): 4e2e62f

changing list order adding coherence

Browse files
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import appStore.keyword_search as keyword_search
2
  import appStore.sdg_analysis as sdg_analysis
3
- #import appStore.coherence as coherence
4
  import appStore.info as info
5
  from appStore.multiapp import MultiApp
6
  import streamlit as st
@@ -13,5 +13,6 @@ app = MultiApp()
13
  app.add_app("About","house", info.app)
14
  app.add_app("SDG Analysis","gear",sdg_analysis.app)
15
  app.add_app("Search","search", keyword_search.app)
 
16
 
17
  app.run()
 
1
  import appStore.keyword_search as keyword_search
2
  import appStore.sdg_analysis as sdg_analysis
3
+ import appStore.coherence as coherence
4
  import appStore.info as info
5
  from appStore.multiapp import MultiApp
6
  import streamlit as st
 
13
  app.add_app("About","house", info.app)
14
  app.add_app("SDG Analysis","gear",sdg_analysis.app)
15
  app.add_app("Search","search", keyword_search.app)
16
+ app.add_app("NDC Coherence","exclude", coherence.app)
17
 
18
  app.run()
appStore/coherence.py CHANGED
@@ -1,267 +1,8 @@
1
  # set path
2
- import glob, os, sys; sys.path.append('../udfPreprocess')
 
3
 
4
- #import helper
5
- import udfPreprocess.docPreprocessing as pre
6
- import udfPreprocess.cleaning as clean
7
-
8
- #import needed libraries
9
- import seaborn as sns
10
- from pandas import DataFrame
11
- from sentence_transformers import SentenceTransformer, CrossEncoder, util
12
- from sklearn.metrics.pairwise import cosine_similarity
13
- # from keybert import KeyBERT
14
- from transformers import pipeline
15
- import matplotlib.pyplot as plt
16
- import numpy as np
17
  import streamlit as st
18
- import pandas as pd
19
- from rank_bm25 import BM25Okapi
20
- from sklearn.feature_extraction import _stop_words
21
- import string
22
- from tqdm.autonotebook import tqdm
23
- import numpy as np
24
- import urllib.request
25
- import ast
26
- import tempfile
27
- import sqlite3
28
- import json
29
- import urllib.request
30
- import ast
31
- import docx
32
- from docx.shared import Inches
33
- from docx.shared import Pt
34
- from docx.enum.style import WD_STYLE_TYPE
35
 
36
  def app():
37
- # Sidebar
38
- st.sidebar.title('Check Coherence')
39
- st.sidebar.write(' ')
40
- with open('ndcs/countryList.txt') as dfile:
41
- countryList = dfile.read()
42
-
43
- countryList = ast.literal_eval(countryList)
44
- countrynames = list(countryList.keys())
45
-
46
- option = st.sidebar.selectbox('Select Country', (countrynames))
47
- countryCode = countryList[option]
48
-
49
-
50
- with st.container():
51
- st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
52
- st.write(' ')
53
- st.write(' ')
54
-
55
- with st.expander("ℹ️ - About this app", expanded=True):
56
-
57
- st.write(
58
- """
59
- The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
60
- """
61
- )
62
-
63
- st.markdown("")
64
-
65
- st.markdown("")
66
- st.markdown("## 📌 Step One: Upload document of the country selected ")
67
-
68
- with st.container():
69
- docs = None
70
- # asking user for either upload or select existing doc
71
- choice = st.radio(label = 'Select the Document',
72
- help = 'You can upload the document \
73
- or else you can try a example document.',
74
- options = ('Upload Document', 'Try Example'),
75
- horizontal = True)
76
-
77
- if choice == 'Upload Document':
78
- uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
79
- if uploaded_file is not None:
80
- with tempfile.NamedTemporaryFile(mode="wb") as temp:
81
- bytes_data = uploaded_file.getvalue()
82
- temp.write(bytes_data)
83
-
84
- st.write("Uploaded Filename: ", uploaded_file.name)
85
- file_name = uploaded_file.name
86
- file_path = temp.name
87
- docs = pre.load_document(file_path, file_name)
88
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
89
-
90
- else:
91
- # listing the options
92
- option = st.selectbox('Select the example document',
93
- ('South Africa:Low Emission strategy',
94
- 'Ethiopia: 10 Year Development Plan'))
95
- if option is 'South Africa:Low Emission strategy':
96
- file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
97
- countryCode = countryList['South Africa']
98
- st.write("Selected document:", file_name.split('/')[1])
99
- # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
100
- # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
101
- else:
102
- # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
103
- file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
104
- countryCode = countryList['Ethiopia']
105
- st.write("Selected document:", file_name.split('/')[1])
106
-
107
- if option is not None:
108
- docs = pre.load_document(file_path,file_name)
109
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
110
-
111
- with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
112
- cca_sent = dfile.read()
113
-
114
- cca_sent = ast.literal_eval(cca_sent)
115
-
116
- with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
117
- ccm_sent = dfile.read()
118
-
119
- ccm_sent = ast.literal_eval(ccm_sent)
120
-
121
- with open('ndcs/countryList.txt') as dfile:
122
- countryList = dfile.read()
123
-
124
- countryList = ast.literal_eval(countryList)
125
-
126
- def get_document(countryCode: str):
127
- link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
128
- with urllib.request.urlopen(link) as urlfile:
129
- data = json.loads(urlfile.read())
130
- categoriesData = {}
131
- categoriesData['categories']= data['categories']
132
- categoriesData['subcategories']= data['subcategories']
133
- keys_sub = categoriesData['subcategories'].keys()
134
- documentType= 'NDCs'
135
- if documentType in data.keys():
136
- if countryCode in data[documentType].keys():
137
- get_dict = {}
138
- for key, value in data[documentType][countryCode].items():
139
- if key not in ['country_name','region_id', 'region_name']:
140
- get_dict[key] = value['classification']
141
- else:
142
- get_dict[key] = value
143
- else:
144
- return None
145
- else:
146
- return None
147
-
148
- country = {}
149
- for key in categoriesData['categories']:
150
- country[key]= {}
151
- for key,value in categoriesData['subcategories'].items():
152
- country[value['category']][key] = get_dict[key]
153
-
154
- return country
155
-
156
- # country_ndc = get_document('NDCs', countryList[option])
157
-
158
- def countrySpecificCCA(cca_sent, threshold, countryCode):
159
- temp = {}
160
- doc = get_document(countryCode)
161
- for key,value in cca_sent.items():
162
- id_ = doc['climate change adaptation'][key]['id']
163
- if id_ >threshold:
164
- temp[key] = value['id'][id_]
165
- return temp
166
-
167
-
168
- def countrySpecificCCM(ccm_sent, threshold, countryCode):
169
- temp = {}
170
- doc = get_document(countryCode)
171
- for key,value in ccm_sent.items():
172
- id_ = doc['climate change mitigation'][key]['id']
173
- if id_ >threshold:
174
- temp[key] = value['id'][id_]
175
-
176
- return temp
177
-
178
-
179
-
180
- if docs is not None:
181
- sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
182
- sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
183
- #st.write(sent_ccm)
184
- @st.cache(allow_output_mutation=True)
185
- def load_sentenceTransformer(name):
186
- return SentenceTransformer(name)
187
- model = load_sentenceTransformer('all-MiniLM-L6-v2')
188
-
189
- document_embeddings = model.encode(paraList, show_progress_bar=True)
190
-
191
- genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
192
- if genre == 'Climate Change Adaptation':
193
- sent_dict = sent_cca
194
- sent_labels = []
195
- for key,sent in sent_dict.items():
196
- sent_labels.append(sent)
197
- label_embeddings = model.encode(sent_labels, show_progress_bar=True)
198
- similarity_high_threshold = 0.55
199
- similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
200
- label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
201
-
202
- positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
203
-
204
-
205
- else:
206
- sent_dict = sent_ccm
207
- sent_labels = []
208
- for key,sent in sent_dict.items():
209
- sent_labels.append(sent)
210
- label_embeddings = model.encode(sent_labels, show_progress_bar=True)
211
- similarity_high_threshold = 0.55
212
- similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
213
- label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
214
-
215
- positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
216
-
217
-
218
- # sent_labels = []
219
- # for key,sent in sent_dict.items():
220
- # sent_labels.append(sent)
221
-
222
-
223
- # label_embeddings = model.encode(sent_labels, show_progress_bar=True)
224
-
225
- #similarity_high_threshold = 0.55
226
- # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
227
- #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
228
-
229
- #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
230
- document = docx.Document()
231
- document.add_heading('Document name:{}'.format(file_name), 2)
232
- section = document.sections[0]
233
-
234
- # Calling the footer
235
- footer = section.footer
236
-
237
- # Calling the paragraph already present in
238
- # the footer section
239
- footer_para = footer.paragraphs[0]
240
-
241
- font_styles = document.styles
242
- font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
243
- font_object = font_charstyle.font
244
- font_object.size = Pt(7)
245
- # Adding the centered zoned footer
246
- footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
247
-
248
- document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))
249
-
250
- for _label_idx, _paragraph_idx in positive_indices:
251
- st.write("This paragraph: \n")
252
- document.add_paragraph("This paragraph: \n")
253
- st.write(paraList[_paragraph_idx])
254
- st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
255
- document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
256
- st.write('-'*10)
257
- document.add_paragraph('-'*10)
258
-
259
- document.save('demo.docx')
260
- with open("demo.docx", "rb") as file:
261
- btn = st.download_button(
262
- label="Download file",
263
- data=file,
264
- file_name="demo.docx",
265
- mime="txt/docx"
266
- )
267
-
 
1
  # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../utils')
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def app():
8
+ st.write("Coming soon")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appStore/keyword_search.py CHANGED
@@ -56,10 +56,11 @@ def app():
56
  on the context as well. The semantic search allows for a personalized\
57
  experience in using the application. Both methods employ a \
58
  probabilistic retrieval framework in its identification of relevant \
59
- paragraphs. By defualt the search is perfomred using 'Semantic Search'
60
- , to find 'Exact/Lexical Matches' checkbox is provided, which will \
61
  by pass semantic search.. Furthermore, the application allows the \
62
- user to search for pre-defined keywords from different thematic buckets""")
 
63
 
64
 
65
  with st.sidebar:
@@ -72,11 +73,6 @@ def app():
72
  else:
73
  keywordList = None
74
 
75
- # searchtype = st.selectbox("Do you want to find exact macthes or similar \
76
- # meaning/context",
77
- # ['Exact Matches', 'Similar context/meaning'])
78
-
79
-
80
  st.markdown("---")
81
 
82
  with st.container():
@@ -84,7 +80,6 @@ def app():
84
  # queryList = st.text_input("You selected the {} category we \
85
  # will look for these keywords in document".format(genre),
86
  # value="{}".format(keywordList))
87
- # else:
88
  queryList = st.text_input("Please enter here your question and we \
89
  will look for an answer in the document\
90
  OR enter the keyword you are looking \
@@ -92,7 +87,6 @@ def app():
92
  context in the document. You can select the \
93
  presets of keywords from sidebar.",
94
  value = "{}".format(keywordList))
95
- # placeholder="Enter keyword here")
96
  searchtype = st.checkbox("Show only Exact Matches")
97
  if st.button("Find them"):
98
 
@@ -129,10 +123,13 @@ def app():
129
  split_overlap=split_overlap,
130
  removePunc= remove_punc,
131
  split_respect_sentence_boundary=split_respect_sentence_boundary)
132
-
 
 
 
133
 
134
  logging.info("starting semantic search")
135
- with st.spinner("Performing Similar/Contextual search"):
136
  semantic_search(query = queryList,
137
  documents = allDocuments['documents'],
138
  embedding_model=embedding_model,
 
56
  on the context as well. The semantic search allows for a personalized\
57
  experience in using the application. Both methods employ a \
58
  probabilistic retrieval framework in its identification of relevant \
59
+ paragraphs. By defualt the search is performed using 'Semantic Search'
60
+ to find 'Exact/Lexical Matches' please tick the checkbox provided, which will \
61
  by pass semantic search.. Furthermore, the application allows the \
62
+ user to search for pre-defined keywords from different thematic buckets\
63
+ present in sidebar.""")
64
 
65
 
66
  with st.sidebar:
 
73
  else:
74
  keywordList = None
75
 
 
 
 
 
 
76
  st.markdown("---")
77
 
78
  with st.container():
 
80
  # queryList = st.text_input("You selected the {} category we \
81
  # will look for these keywords in document".format(genre),
82
  # value="{}".format(keywordList))
 
83
  queryList = st.text_input("Please enter here your question and we \
84
  will look for an answer in the document\
85
  OR enter the keyword you are looking \
 
87
  context in the document. You can select the \
88
  presets of keywords from sidebar.",
89
  value = "{}".format(keywordList))
 
90
  searchtype = st.checkbox("Show only Exact Matches")
91
  if st.button("Find them"):
92
 
 
123
  split_overlap=split_overlap,
124
  removePunc= remove_punc,
125
  split_respect_sentence_boundary=split_respect_sentence_boundary)
126
+ if len(allDocuments['documents']) > 100:
127
+ warning_msg = ": This might take sometime, please sit back and relax."
128
+ else:
129
+ warning_msg = ""
130
 
131
  logging.info("starting semantic search")
132
+ with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
133
  semantic_search(query = queryList,
134
  documents = allDocuments['documents'],
135
  embedding_model=embedding_model,
docStore/sample/files.json CHANGED
@@ -1,2 +1,3 @@
1
- {"South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.txt",
2
- "Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt"}
 
 
1
+ {"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
2
+ "South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.txt"
3
+ }
docStore/sample/keywordexample.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "Food":"Food security,Nutrition,Diets,Food loss",
3
  "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
4
- "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
 
5
  "Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
6
- "Implementation":"Implementation,transformation,reform,integration,strategy,policy"
7
  }
 
1
  {
 
2
  "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
3
+ "Food":"Food security,Nutrition,Diets,Food loss",
4
+ "Implementation":"Implementation,transformation,reform,integration,strategy,policy",
5
  "Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
6
+ "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority"
7
  }
paramconfig.cfg CHANGED
@@ -12,7 +12,7 @@ RETRIEVER = msmarco-bert-base-dot-v5
12
  RETRIEVER_FORMAT = sentence_transformers
13
  RETRIEVER_EMB_LAYER = -1
14
  READER = deepset/tinyroberta-squad2
15
- READER_TOP_K = 5
16
  THRESHOLD = 0.1
17
  SPLIT_BY = sentence
18
  SPLIT_LENGTH = 3
 
12
  RETRIEVER_FORMAT = sentence_transformers
13
  RETRIEVER_EMB_LAYER = -1
14
  READER = deepset/tinyroberta-squad2
15
+ READER_TOP_K = 10
16
  THRESHOLD = 0.1
17
  SPLIT_BY = sentence
18
  SPLIT_LENGTH = 3
ver0.1 scripts/coherence.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys; sys.path.append('../udfPreprocess')
3
+
4
+ #import helper
5
+ import udfPreprocess.docPreprocessing as pre
6
+ import udfPreprocess.cleaning as clean
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ from pandas import DataFrame
11
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ # from keybert import KeyBERT
14
+ from transformers import pipeline
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ import streamlit as st
18
+ import pandas as pd
19
+ from rank_bm25 import BM25Okapi
20
+ from sklearn.feature_extraction import _stop_words
21
+ import string
22
+ from tqdm.autonotebook import tqdm
23
+ import numpy as np
24
+ import urllib.request
25
+ import ast
26
+ import tempfile
27
+ import sqlite3
28
+ import json
29
+ import urllib.request
30
+ import ast
31
+ import docx
32
+ from docx.shared import Inches
33
+ from docx.shared import Pt
34
+ from docx.enum.style import WD_STYLE_TYPE
35
+
36
+ def app():
37
+ # Sidebar
38
+ st.sidebar.title('Check Coherence')
39
+ st.sidebar.write(' ')
40
+ with open('ndcs/countryList.txt') as dfile:
41
+ countryList = dfile.read()
42
+
43
+ countryList = ast.literal_eval(countryList)
44
+ countrynames = list(countryList.keys())
45
+
46
+ option = st.sidebar.selectbox('Select Country', (countrynames))
47
+ countryCode = countryList[option]
48
+
49
+
50
+ with st.container():
51
+ st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
52
+ st.write(' ')
53
+ st.write(' ')
54
+
55
+ with st.expander("ℹ️ - About this app", expanded=True):
56
+
57
+ st.write(
58
+ """
59
+ The *Check Coherence* app is an easy-to-use interface built in Streamlit for doing analysis of policy document and finding the coherence between NDCs/New-Updated NDCs- developed by GIZ Data and the Sustainable Development Solution Network.
60
+ """
61
+ )
62
+
63
+ st.markdown("")
64
+
65
+ st.markdown("")
66
+ st.markdown("## 📌 Step One: Upload document of the country selected ")
67
+
68
+ with st.container():
69
+ docs = None
70
+ # asking user for either upload or select existing doc
71
+ choice = st.radio(label = 'Select the Document',
72
+ help = 'You can upload the document \
73
+ or else you can try a example document.',
74
+ options = ('Upload Document', 'Try Example'),
75
+ horizontal = True)
76
+
77
+ if choice == 'Upload Document':
78
+ uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
79
+ if uploaded_file is not None:
80
+ with tempfile.NamedTemporaryFile(mode="wb") as temp:
81
+ bytes_data = uploaded_file.getvalue()
82
+ temp.write(bytes_data)
83
+
84
+ st.write("Uploaded Filename: ", uploaded_file.name)
85
+ file_name = uploaded_file.name
86
+ file_path = temp.name
87
+ docs = pre.load_document(file_path, file_name)
88
+ haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
89
+
90
+ else:
91
+ # listing the options
92
+ option = st.selectbox('Select the example document',
93
+ ('South Africa:Low Emission strategy',
94
+ 'Ethiopia: 10 Year Development Plan'))
95
+ if option is 'South Africa:Low Emission strategy':
96
+ file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
97
+ countryCode = countryList['South Africa']
98
+ st.write("Selected document:", file_name.split('/')[1])
99
+ # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
100
+ # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
101
+ else:
102
+ # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
103
+ file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
104
+ countryCode = countryList['Ethiopia']
105
+ st.write("Selected document:", file_name.split('/')[1])
106
+
107
+ if option is not None:
108
+ docs = pre.load_document(file_path,file_name)
109
+ haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
110
+
111
+ with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
112
+ cca_sent = dfile.read()
113
+
114
+ cca_sent = ast.literal_eval(cca_sent)
115
+
116
+ with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
117
+ ccm_sent = dfile.read()
118
+
119
+ ccm_sent = ast.literal_eval(ccm_sent)
120
+
121
+ with open('ndcs/countryList.txt') as dfile:
122
+ countryList = dfile.read()
123
+
124
+ countryList = ast.literal_eval(countryList)
125
+
126
+ def get_document(countryCode: str):
127
+ link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
128
+ with urllib.request.urlopen(link) as urlfile:
129
+ data = json.loads(urlfile.read())
130
+ categoriesData = {}
131
+ categoriesData['categories']= data['categories']
132
+ categoriesData['subcategories']= data['subcategories']
133
+ keys_sub = categoriesData['subcategories'].keys()
134
+ documentType= 'NDCs'
135
+ if documentType in data.keys():
136
+ if countryCode in data[documentType].keys():
137
+ get_dict = {}
138
+ for key, value in data[documentType][countryCode].items():
139
+ if key not in ['country_name','region_id', 'region_name']:
140
+ get_dict[key] = value['classification']
141
+ else:
142
+ get_dict[key] = value
143
+ else:
144
+ return None
145
+ else:
146
+ return None
147
+
148
+ country = {}
149
+ for key in categoriesData['categories']:
150
+ country[key]= {}
151
+ for key,value in categoriesData['subcategories'].items():
152
+ country[value['category']][key] = get_dict[key]
153
+
154
+ return country
155
+
156
+ # country_ndc = get_document('NDCs', countryList[option])
157
+
158
+ def countrySpecificCCA(cca_sent, threshold, countryCode):
159
+ temp = {}
160
+ doc = get_document(countryCode)
161
+ for key,value in cca_sent.items():
162
+ id_ = doc['climate change adaptation'][key]['id']
163
+ if id_ >threshold:
164
+ temp[key] = value['id'][id_]
165
+ return temp
166
+
167
+
168
+ def countrySpecificCCM(ccm_sent, threshold, countryCode):
169
+ temp = {}
170
+ doc = get_document(countryCode)
171
+ for key,value in ccm_sent.items():
172
+ id_ = doc['climate change mitigation'][key]['id']
173
+ if id_ >threshold:
174
+ temp[key] = value['id'][id_]
175
+
176
+ return temp
177
+
178
+
179
+
180
+ if docs is not None:
181
+ sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
182
+ sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
183
+ #st.write(sent_ccm)
184
+ @st.cache(allow_output_mutation=True)
185
+ def load_sentenceTransformer(name):
186
+ return SentenceTransformer(name)
187
+ model = load_sentenceTransformer('all-MiniLM-L6-v2')
188
+
189
+ document_embeddings = model.encode(paraList, show_progress_bar=True)
190
+
191
+ genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
192
+ if genre == 'Climate Change Adaptation':
193
+ sent_dict = sent_cca
194
+ sent_labels = []
195
+ for key,sent in sent_dict.items():
196
+ sent_labels.append(sent)
197
+ label_embeddings = model.encode(sent_labels, show_progress_bar=True)
198
+ similarity_high_threshold = 0.55
199
+ similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
200
+ label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
201
+
202
+ positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
203
+
204
+
205
+ else:
206
+ sent_dict = sent_ccm
207
+ sent_labels = []
208
+ for key,sent in sent_dict.items():
209
+ sent_labels.append(sent)
210
+ label_embeddings = model.encode(sent_labels, show_progress_bar=True)
211
+ similarity_high_threshold = 0.55
212
+ similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
213
+ label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
214
+
215
+ positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
216
+
217
+
218
+ # sent_labels = []
219
+ # for key,sent in sent_dict.items():
220
+ # sent_labels.append(sent)
221
+
222
+
223
+ # label_embeddings = model.encode(sent_labels, show_progress_bar=True)
224
+
225
+ #similarity_high_threshold = 0.55
226
+ # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
227
+ #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
228
+
229
+ #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
230
+ document = docx.Document()
231
+ document.add_heading('Document name:{}'.format(file_name), 2)
232
+ section = document.sections[0]
233
+
234
+ # Calling the footer
235
+ footer = section.footer
236
+
237
+ # Calling the paragraph already present in
238
+ # the footer section
239
+ footer_para = footer.paragraphs[0]
240
+
241
+ font_styles = document.styles
242
+ font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
243
+ font_object = font_charstyle.font
244
+ font_object.size = Pt(7)
245
+ # Adding the centered zoned footer
246
+ footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
247
+
248
+ document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))
249
+
250
+ for _label_idx, _paragraph_idx in positive_indices:
251
+ st.write("This paragraph: \n")
252
+ document.add_paragraph("This paragraph: \n")
253
+ st.write(paraList[_paragraph_idx])
254
+ st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
255
+ document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
256
+ st.write('-'*10)
257
+ document.add_paragraph('-'*10)
258
+
259
+ document.save('demo.docx')
260
+ with open("demo.docx", "rb") as file:
261
+ btn = st.download_button(
262
+ label="Download file",
263
+ data=file,
264
+ file_name="demo.docx",
265
+ mime="txt/docx"
266
+ )
267
+