prashant committed on
Commit
2caced7
·
1 Parent(s): 3f0df44

moving old SDGandPreProc files

Browse files
appStore/sdg_analysis.py CHANGED
@@ -3,18 +3,13 @@ import glob, os, sys;
3
  sys.path.append('../udfPreprocess')
4
 
5
  #import helper
6
- import udfPreprocess.docPreprocessing as pre
7
- import udfPreprocess.cleaning as clean
8
 
9
  #import needed libraries
10
  import seaborn as sns
11
- from pandas import DataFrame
12
- from keybert import KeyBERT
13
- from transformers import pipeline
14
  import matplotlib.pyplot as plt
15
  import numpy as np
16
  import streamlit as st
17
- import pandas as pd
18
  import docx
19
  from docx.shared import Inches
20
  from docx.shared import Pt
@@ -29,17 +24,6 @@ logger = logging.getLogger(__name__)
29
 
30
 
31
 
32
- # @st.cache(allow_output_mutation=True)
33
- # def load_keyBert():
34
- # return KeyBERT()
35
-
36
- # @st.cache(allow_output_mutation=True)
37
- # def load_sdgClassifier():
38
- # classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
39
- # return classifier
40
-
41
-
42
-
43
  def app():
44
 
45
  with st.container():
@@ -66,19 +50,6 @@ def app():
66
 
67
  df, x = sdg_classification(paraList)
68
 
69
-
70
- # classifier = load_sdgClassifier()
71
-
72
- # labels = classifier(par_list)
73
- # labels_= [(l['label'],l['score']) for l in labels]
74
- # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
75
- # df2['text'] = par_list
76
- # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
77
- # df2.index += 1
78
- # df2 =df2[df2['Relevancy']>.85]
79
- # x = df2['SDG'].value_counts()
80
- # df3 = df2.copy()
81
-
82
  plt.rcParams['font.size'] = 25
83
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
84
  # plot
@@ -88,26 +59,8 @@ def app():
88
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
89
  st.markdown("#### Anything related to SDGs? ####")
90
 
91
- # st.markdown("#### 🎈 Anything related to SDGs? ####")
92
-
93
  c4, c5, c6 = st.columns([2, 2, 2])
94
 
95
- # Add styling
96
- cmGreen = sns.light_palette("green", as_cmap=True)
97
- cmRed = sns.light_palette("red", as_cmap=True)
98
- # df2 = df2.style.background_gradient(
99
- # cmap=cmGreen,
100
- # subset=[
101
- # "Relevancy",
102
- # ],
103
- # )
104
-
105
- # format_dictionary = {
106
- # "Relevancy": "{:.1%}",
107
- # }
108
-
109
- # df2 = df2.format(format_dictionary)
110
-
111
  with c5:
112
  st.pyplot(fig)
113
 
 
3
  sys.path.append('../udfPreprocess')
4
 
5
  #import helper
6
+
 
7
 
8
  #import needed libraries
9
  import seaborn as sns
 
 
 
10
  import matplotlib.pyplot as plt
11
  import numpy as np
12
  import streamlit as st
 
13
  import docx
14
  from docx.shared import Inches
15
  from docx.shared import Pt
 
24
 
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
27
  def app():
28
 
29
  with st.container():
 
50
 
51
  df, x = sdg_classification(paraList)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  plt.rcParams['font.size'] = 25
54
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
55
  # plot
 
59
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
60
  st.markdown("#### Anything related to SDGs? ####")
61
 
 
 
62
  c4, c5, c6 = st.columns([2, 2, 2])
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  with c5:
65
  st.pyplot(fig)
66
 
udfPreprocess/sdg_classifier.py CHANGED
@@ -1,4 +1,3 @@
1
- from tkinter import Text
2
  from haystack.nodes import TransformersDocumentClassifier
3
  from haystack.schema import Document
4
  from typing import List, Tuple
@@ -71,11 +70,18 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
71
 
72
  return df, x
73
 
74
- def runSDGPreprocessingPipeline()->List[Text]:
75
  """
76
  creates the pipeline and runs the preprocessing pipeline,
77
  the params for pipeline are fetched from paramconfig
78
 
 
 
 
 
 
 
 
79
  """
80
  file_path = st.session_state['filepath']
81
  file_name = st.session_state['filename']
 
 
1
  from haystack.nodes import TransformersDocumentClassifier
2
  from haystack.schema import Document
3
  from typing import List, Tuple
 
70
 
71
  return df, x
72
 
73
+ def runSDGPreprocessingPipeline()->List[Document]:
74
  """
75
  creates the pipeline and runs the preprocessing pipeline,
76
  the params for pipeline are fetched from paramconfig
77
 
78
+ Return
79
+ --------------
80
+ List[Document]: When preprocessing pipeline is run, the output dictionary
81
+ has four objects. For the Haystack implementation of SDG classification, we
82
+ need to use the List of Haystack Document, which can be fetched by
83
+ key = 'documents' on output.
84
+
85
  """
86
  file_path = st.session_state['filepath']
87
  file_name = st.session_state['filename']
udfPreprocess/uploadAndExample.py CHANGED
@@ -1,52 +1,40 @@
1
  import streamlit as st
2
  import tempfile
3
- import udfPreprocess.docPreprocessing as pre
4
- import udfPreprocess.cleaning as clean
5
 
6
  def add_upload(choice):
 
 
 
 
 
 
7
 
8
 
9
  if choice == 'Upload Document':
10
- uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
11
- if uploaded_file is not None:
 
12
  with tempfile.NamedTemporaryFile(mode="wb") as temp:
13
  bytes_data = uploaded_file.getvalue()
14
  temp.write(bytes_data)
15
  st.session_state['filename'] = uploaded_file.name
16
- # st.write("Uploaded Filename: ", uploaded_file.name)
17
  file_name = uploaded_file.name
18
  file_path = temp.name
19
- # docs = pre.load_document(file_path, file_name)
20
- # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
  st.session_state['filename'] = file_name
22
- # st.session_state['paraList'] = paraList
23
  st.session_state['filepath'] = file_path
24
 
25
 
26
 
27
  else:
28
- # listing the options
29
- option = st.sidebar.selectbox('Select the example document',
30
- ('South Africa:Low Emission strategy',
31
- 'Ethiopia: 10 Year Development Plan'))
32
- if option is 'South Africa:Low Emission strategy':
33
  file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
34
  st.session_state['filename'] = file_name
35
  st.sesion_state['filepath'] = file_path
36
- # st.write("Selected document:", file_name.split('/')[1])
37
- # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
38
- # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
39
- else:
40
- # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
41
  file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
42
  st.session_state['filename'] = file_name
43
- st.session_state['filepath'] = file_path
44
- # st.write("Selected document:", file_name.split('/')[1])
45
-
46
- # if option is not None:
47
- # docs = pre.load_document(file_path,file_name)
48
- # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
49
- # st.session_state['docs'] = docs
50
- # st.session_state['paraList'] = paraList
51
-
52
-
 
1
  import streamlit as st
2
  import tempfile
 
 
3
 
4
  def add_upload(choice):
5
+ """
6
+ Provides the user with a choice to either 'Upload Document' or 'Try Example'.
7
+ Based on the user's choice, runs streamlit processes and saves the path and name of
8
+ the 'file' to streamlit session_state which then can be fetched later.
9
+
10
+ """
11
 
12
 
13
  if choice == 'Upload Document':
14
+ uploaded_file = st.sidebar.file_uploader('Upload the File',
15
+ type=['pdf', 'docx', 'txt'])
16
+ if uploaded_file is not None:
17
  with tempfile.NamedTemporaryFile(mode="wb") as temp:
18
  bytes_data = uploaded_file.getvalue()
19
  temp.write(bytes_data)
20
  st.session_state['filename'] = uploaded_file.name
 
21
  file_name = uploaded_file.name
22
  file_path = temp.name
 
 
23
  st.session_state['filename'] = file_name
 
24
  st.session_state['filepath'] = file_path
25
 
26
 
27
 
28
  else:
29
+ # listing the options
30
+ option = st.sidebar.selectbox('Select the example document',
31
+ ('South Africa:Low Emission strategy',
32
+ 'Ethiopia: 10 Year Development Plan'))
33
+ if option is 'South Africa:Low Emission strategy':
34
  file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
35
  st.session_state['filename'] = file_name
36
  st.sesion_state['filepath'] = file_path
37
+ else:
 
 
 
 
38
  file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
39
  st.session_state['filename'] = file_name
40
+ st.session_state['filepath'] = file_path
 
 
 
 
 
 
 
 
 
{udfPreprocess → ver0.1 scripts}/cleaning.py RENAMED
File without changes
{udfPreprocess → ver0.1 scripts}/docPreprocessing.py RENAMED
File without changes
{udfPreprocess → ver0.1 scripts}/sdg.py RENAMED
File without changes
ver0.1 scripts/sdg_analysis.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../udfPreprocess')
4
+
5
+ #import helper
6
+
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import streamlit as st
13
+ import docx
14
+ from docx.shared import Inches
15
+ from docx.shared import Pt
16
+ from docx.enum.style import WD_STYLE_TYPE
17
+ from udfPreprocess.sdg_classifier import sdg_classification
18
+ from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
19
+ import configparser
20
+ import tempfile
21
+ import sqlite3
22
+ import logging
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+
27
+ def app():
28
+
29
+ with st.container():
30
+ st.markdown("<h1 style='text-align: center; color: black;'> SDSN x GIZ Policy Action Tracking v0.1</h1>", unsafe_allow_html=True)
31
+ st.write(' ')
32
+ st.write(' ')
33
+
34
+ with st.expander("ℹ️ - About this app", expanded=False):
35
+
36
+ st.write(
37
+ """
38
+ The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
39
+ """)
40
+ st.markdown("")
41
+
42
+
43
+ with st.container():
44
+
45
+
46
+
47
+ if 'filepath' in st.session_state:
48
+ paraList = runSDGPreprocessingPipeline()
49
+ with st.spinner("Running SDG"):
50
+
51
+ df, x = sdg_classification(paraList)
52
+
53
+
54
+ # classifier = load_sdgClassifier()
55
+
56
+ # labels = classifier(par_list)
57
+ # labels_= [(l['label'],l['score']) for l in labels]
58
+ # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
59
+ # df2['text'] = par_list
60
+ # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
61
+ # df2.index += 1
62
+ # df2 =df2[df2['Relevancy']>.85]
63
+ # x = df2['SDG'].value_counts()
64
+ # df3 = df2.copy()
65
+
66
+ plt.rcParams['font.size'] = 25
67
+ colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
68
+ # plot
69
+ fig, ax = plt.subplots()
70
+ ax.pie(x, colors=colors, radius=2, center=(4, 4),
71
+ wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
72
+ # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
73
+ st.markdown("#### Anything related to SDGs? ####")
74
+
75
+ # st.markdown("#### 🎈 Anything related to SDGs? ####")
76
+
77
+ c4, c5, c6 = st.columns([2, 2, 2])
78
+
79
+ # Add styling
80
+ cmGreen = sns.light_palette("green", as_cmap=True)
81
+ cmRed = sns.light_palette("red", as_cmap=True)
82
+ # df2 = df2.style.background_gradient(
83
+ # cmap=cmGreen,
84
+ # subset=[
85
+ # "Relevancy",
86
+ # ],
87
+ # )
88
+
89
+ # format_dictionary = {
90
+ # "Relevancy": "{:.1%}",
91
+ # }
92
+
93
+ # df2 = df2.format(format_dictionary)
94
+
95
+ with c5:
96
+ st.pyplot(fig)
97
+
98
+ c7, c8, c9 = st.columns([1, 10, 1])
99
+ with c8:
100
+ st.table(df)
101
+
102
+
103
+ # 1. Keyword heatmap \n
104
+ # 2. SDG Classification for the paragraphs/texts in the document
105
+ #
106
+
107
+ # with st.container():
108
+ # if 'docs' in st.session_state:
109
+ # docs = st.session_state['docs']
110
+ # docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
111
+ # # paraList = st.session_state['paraList']
112
+ # logging.info("keybert")
113
+ # with st.spinner("Running Key bert"):
114
+
115
+ # kw_model = load_keyBert()
116
+
117
+ # keywords = kw_model.extract_keywords(
118
+ # all_text,
119
+ # keyphrase_ngram_range=(1, 3),
120
+ # use_mmr=True,
121
+ # stop_words="english",
122
+ # top_n=10,
123
+ # diversity=0.7,
124
+ # )
125
+
126
+ # st.markdown("## 🎈 What is my document about?")
127
+
128
+ # df = (
129
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
130
+ # .sort_values(by="Relevancy", ascending=False)
131
+ # .reset_index(drop=True)
132
+ # )
133
+ # df1 = (
134
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
135
+ # .sort_values(by="Relevancy", ascending=False)
136
+ # .reset_index(drop=True)
137
+ # )
138
+ # df.index += 1
139
+
140
+ # # Add styling
141
+ # cmGreen = sns.light_palette("green", as_cmap=True)
142
+ # cmRed = sns.light_palette("red", as_cmap=True)
143
+ # df = df.style.background_gradient(
144
+ # cmap=cmGreen,
145
+ # subset=[
146
+ # "Relevancy",
147
+ # ],
148
+ # )
149
+
150
+ # c1, c2, c3 = st.columns([1, 3, 1])
151
+
152
+ # format_dictionary = {
153
+ # "Relevancy": "{:.1%}",
154
+ # }
155
+
156
+ # df = df.format(format_dictionary)
157
+
158
+ # with c2:
159
+ #
160
+ # st.table(df)
ver0.1 scripts/uploadAndExample.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import udfPreprocess.docPreprocessing as pre
4
+ import udfPreprocess.cleaning as clean
5
+
6
+ def add_upload(choice):
7
+
8
+
9
+ if choice == 'Upload Document':
10
+ uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
11
+ if uploaded_file is not None:
12
+ with tempfile.NamedTemporaryFile(mode="wb") as temp:
13
+ bytes_data = uploaded_file.getvalue()
14
+ temp.write(bytes_data)
15
+ st.session_state['filename'] = uploaded_file.name
16
+ # st.write("Uploaded Filename: ", uploaded_file.name)
17
+ file_name = uploaded_file.name
18
+ file_path = temp.name
19
+ # docs = pre.load_document(file_path, file_name)
20
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
+ st.session_state['filename'] = file_name
22
+ # st.session_state['paraList'] = paraList
23
+ st.session_state['filepath'] = file_path
24
+
25
+
26
+
27
+ else:
28
+ # listing the options
29
+ option = st.sidebar.selectbox('Select the example document',
30
+ ('South Africa:Low Emission strategy',
31
+ 'Ethiopia: 10 Year Development Plan'))
32
+ if option is 'South Africa:Low Emission strategy':
33
+ file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
34
+ st.session_state['filename'] = file_name
35
+ st.sesion_state['filepath'] = file_path
36
+ # st.write("Selected document:", file_name.split('/')[1])
37
+ # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
38
+ # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
39
+ else:
40
+ # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
41
+ file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
42
+ st.session_state['filename'] = file_name
43
+ st.session_state['filepath'] = file_path
44
+ # st.write("Selected document:", file_name.split('/')[1])
45
+
46
+ # if option is not None:
47
+ # docs = pre.load_document(file_path,file_name)
48
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
49
+ # st.session_state['docs'] = docs
50
+ # st.session_state['paraList'] = paraList
51
+
52
+