prashant committed on
Commit
ce1209f
·
1 Parent(s): 1984bd1

info and sdg update

Browse files
appStore/info.py CHANGED
@@ -28,7 +28,7 @@ def app():
28
  </div>
29
  """
30
  st.markdown(footer, unsafe_allow_html=True)
31
- # <div class="text">
32
  c1, c2, c3 = st.columns([8,1,12])
33
  with c1:
34
  st.image("docStore/img/ndc.png")
@@ -42,13 +42,12 @@ def app():
42
  evaluation of stated goals and targets and their actual implementation on \
43
  the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
44
  Language Processing (NLP) methods can help in shortening and easing this \
45
- task for policy analysts.</div>',
46
  unsafe_allow_html=True)
47
 
48
  intro = """
49
  <div style="text-align: justify;">
50
 
51
-
52
  For this purpose, the United Nations Sustainable Development Solutions \
53
  Network (SDSN) and the Deutsche Gesellschaft für Internationale \
54
  Zusammenarbeit (GIZ) GmbH are collaborating since 2021 in the development \
 
28
  </div>
29
  """
30
  st.markdown(footer, unsafe_allow_html=True)
31
+
32
  c1, c2, c3 = st.columns([8,1,12])
33
  with c1:
34
  st.image("docStore/img/ndc.png")
 
42
  evaluation of stated goals and targets and their actual implementation on \
43
  the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
44
  Language Processing (NLP) methods can help in shortening and easing this \
45
+ task for policy analysts.</div><br>',
46
  unsafe_allow_html=True)
47
 
48
  intro = """
49
  <div style="text-align: justify;">
50
 
 
51
  For this purpose, the United Nations Sustainable Development Solutions \
52
  Network (SDSN) and the Deutsche Gesellschaft für Internationale \
53
  Zusammenarbeit (GIZ) GmbH are collaborating since 2021 in the development \
appStore/sdg_analysis.py CHANGED
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
23
  def app():
24
 
25
  with st.container():
26
- st.markdown("<h2 style='text-align: center; color: black;'> SDG Analysis on Polcy Document</h2>", unsafe_allow_html=True)
27
  st.write(' ')
28
  st.write(' ')
29
 
@@ -31,12 +31,45 @@ def app():
31
 
32
  st.write(
33
  """
34
- The *SDG Analysis on Polcy Document* app is an easy-to-use interface built \
35
  in Streamlit for analyzing policy documents with respect to SDG \
36
  Classification for the paragraphs/texts in the document and \
37
  extracting the keyphrase per SDG label - developed by GIZ Data \
38
  and the Sustainable Development Solution Network. \n
39
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  st.markdown("")
41
 
42
 
@@ -57,11 +90,11 @@ def app():
57
 
58
  df, x = sdg_classification(allDocuments['documents'])
59
  sdg_labels = df.SDG.unique()
60
- tfidfkeywordList = []
61
  textrankkeywordlist = []
62
  for label in sdg_labels:
63
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
64
- tfidflist_ = keywordExtraction(label,[sdgdata])
65
  textranklist_ = textrank(sdgdata, words = 20)
66
  tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
67
  textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
@@ -69,8 +102,6 @@ def app():
69
  tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
70
 
71
 
72
-
73
-
74
  plt.rcParams['font.size'] = 25
75
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
76
  # plot
 
23
  def app():
24
 
25
  with st.container():
26
+ st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
27
  st.write(' ')
28
  st.write(' ')
29
 
 
31
 
32
  st.write(
33
  """
34
+ The *SDG Analysis* app is an easy-to-use interface built \
35
  in Streamlit for analyzing policy documents with respect to SDG \
36
  Classification for the paragraphs/texts in the document and \
37
  extracting the keyphrase per SDG label - developed by GIZ Data \
38
  and the Sustainable Development Solution Network. \n
39
  """)
40
+ st.write("""Document Processing: The Uploaded/Selected document is \
41
+ automatically cleaned and split into paragraphs with a maximum \
42
+ length of 120 words using a Haystack preprocessing pipeline. The \
43
+ length of 120 is an empirical value which should reflect the length \
44
+ of a “context” and should limit the paragraph length deviation. \
45
+ However, since we want to respect the sentence boundary the limit \
46
+ can breach and hence this limit of 120 is tentative.\n
47
+
48
+ SDG Classification: The application assigns paragraphs to 15 of \
49
+ the 17 United Nations Sustainable Development Goals (SDGs). SDG 16 \
50
+ “Peace, Justice and Strong Institutions” and SDG 17 \
51
+ “Partnerships for the Goals” are excluded from the analysis due to \
52
+ their broad nature which could potentially inflate the results. \
53
+ Each paragraph is assigned to one SDG only. Again, the results are \
54
+ displayed in a summary table including the number of the SDG, a \
55
+ relevancy score highlighted through a green color shading, and the \
56
+ respective text of the analyzed paragraph. Additionally, a pie \
57
+ chart with a blue color shading is displayed which illustrates the \
58
+ three most prominent SDGs in the document. The SDG classification \
59
+ uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
60
+ from [OSDG.ai](https://osdg.ai/) which is a global \
61
+ partnership and growing community of researchers and institutions \
62
+ interested in the classification of research according to the \
63
+ Sustainable Development Goals. The summary table only displays \
64
+ paragraphs with a calculated relevancy score above 85%.\n
65
+
66
+ Keyphrase Extraction: The application extracts 15 keyphrases from \
67
+ the document, calculates a respective relevancy score, and displays \
68
+ the results in a summary table. The keyphrases are extracted using \
69
+ [Textrank](https://github.com/summanlp/textrank) which is an \
70
+ easy-to-use, computationally less expensive \
71
+ model leveraging combination of TFIDF and Graph networks.
72
+ """)
73
  st.markdown("")
74
 
75
 
 
90
 
91
  df, x = sdg_classification(allDocuments['documents'])
92
  sdg_labels = df.SDG.unique()
93
+ # tfidfkeywordList = []
94
  textrankkeywordlist = []
95
  for label in sdg_labels:
96
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
97
+ # tfidflist_ = keywordExtraction(label,[sdgdata])
98
  textranklist_ = textrank(sdgdata, words = 20)
99
  tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
100
  textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
 
102
  tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
103
 
104
 
 
 
105
  plt.rcParams['font.size'] = 25
106
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
107
  # plot
paramconfig.cfg CHANGED
@@ -22,8 +22,9 @@ THRESHOLD = 0.85
22
  MODEL = jonas/sdg_classifier_osdg
23
  SPLIT_BY = word
24
  REMOVE_PUNC = 0
25
- SPLIT_LENGTH = 110
26
  SPLIT_OVERLAP = 10
 
27
 
28
  [preprocessor]
29
  SPLIT_OVERLAP_WORD = 10
 
22
  MODEL = jonas/sdg_classifier_osdg
23
  SPLIT_BY = word
24
  REMOVE_PUNC = 0
25
+ SPLIT_LENGTH = 120
26
  SPLIT_OVERLAP = 10
27
+ RESPECT_SENTENCE_BOUNDARY = 1
28
 
29
  [preprocessor]
30
  SPLIT_OVERLAP_WORD = 10
utils/preprocessing.py CHANGED
@@ -9,10 +9,6 @@ import logging
9
  import re
10
  import string
11
  from haystack.pipelines import Pipeline
12
- import configparser
13
- config = configparser.ConfigParser()
14
- config.read_file(open('paramconfig.cfg'))
15
- top_k = int(config.get('lexical_search','TOP_K'))
16
 
17
  def useOCR(file_path: str)-> Text:
18
  """
@@ -167,11 +163,10 @@ class UdfPreProcessor(BaseComponent):
167
 
168
  """
169
  outgoing_edges = 1
170
- # split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
171
- # split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
172
 
173
  def run(self, documents:List[Document], removePunc:bool,
174
  split_by: Literal["sentence", "word"] = 'sentence',
 
175
  split_length:int = 2, split_overlap = 0):
176
 
177
  """ this is required method to invoke the component in
@@ -198,11 +193,9 @@ class UdfPreProcessor(BaseComponent):
198
 
199
  if split_by == 'sentence':
200
  split_respect_sentence_boundary = False
201
- # split_overlap=self.split_overlap_sentence
202
 
203
  else:
204
- split_respect_sentence_boundary = True
205
- # split_overlap= self.split_overlap_word
206
 
207
  preprocessor = PreProcessor(
208
  clean_empty_lines=True,
@@ -218,6 +211,8 @@ class UdfPreProcessor(BaseComponent):
218
  )
219
 
220
  for i in documents:
 
 
221
  docs_processed = preprocessor.process([i])
222
  for item in docs_processed:
223
  item.content = basic(item.content, removePunc= removePunc)
@@ -243,7 +238,7 @@ class UdfPreProcessor(BaseComponent):
243
  def processingpipeline():
244
  """
245
  Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
246
- from utils.
247
 
248
  """
249
 
 
9
  import re
10
  import string
11
  from haystack.pipelines import Pipeline
 
 
 
 
12
 
13
  def useOCR(file_path: str)-> Text:
14
  """
 
163
 
164
  """
165
  outgoing_edges = 1
 
 
166
 
167
  def run(self, documents:List[Document], removePunc:bool,
168
  split_by: Literal["sentence", "word"] = 'sentence',
169
+ split_respect_sentence_boundary = False,
170
  split_length:int = 2, split_overlap = 0):
171
 
172
  """ this is required method to invoke the component in
 
193
 
194
  if split_by == 'sentence':
195
  split_respect_sentence_boundary = False
 
196
 
197
  else:
198
+ split_respect_sentence_boundary = split_respect_sentence_boundary
 
199
 
200
  preprocessor = PreProcessor(
201
  clean_empty_lines=True,
 
211
  )
212
 
213
  for i in documents:
214
+ # # basic cleaning before passing it to preprocessor.
215
+ # i = basic(i)
216
  docs_processed = preprocessor.process([i])
217
  for item in docs_processed:
218
  item.content = basic(item.content, removePunc= removePunc)
 
238
  def processingpipeline():
239
  """
240
  Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
241
+ from utils.preprocessing
242
 
243
  """
244
 
utils/sdg_classifier.py CHANGED
@@ -106,7 +106,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
106
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
107
  split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
108
  remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
109
-
110
 
111
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
112
  params= {"FileConverter": {"file_path": filePath, \
@@ -114,6 +114,7 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
114
  "UdfPreProcessor": {"removePunc": remove_punc, \
115
  "split_by": split_by, \
116
  "split_length":split_length,\
117
- "split_overlap": split_overlap}})
 
118
 
119
  return output_sdg_pre
 
106
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
107
  split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
108
  remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
109
+ split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))
110
 
111
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
112
  params= {"FileConverter": {"file_path": filePath, \
 
114
  "UdfPreProcessor": {"removePunc": remove_punc, \
115
  "split_by": split_by, \
116
  "split_length":split_length,\
117
+ "split_overlap": split_overlap, \
118
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
119
 
120
  return output_sdg_pre
utils/uploadAndExample.py CHANGED
@@ -8,7 +8,6 @@ def add_upload(choice):
8
  the 'file' to streamlit session_state which then can be fetched later.
9
 
10
  """
11
-
12
 
13
  if choice == 'Upload Document':
14
  uploaded_file = st.sidebar.file_uploader('Upload the File',
@@ -21,7 +20,6 @@ def add_upload(choice):
21
  st.session_state['filepath'] = temp.name
22
 
23
 
24
-
25
  else:
26
  # listing the options
27
  option = st.sidebar.selectbox('Select the example document',
 
8
  the 'file' to streamlit session_state which then can be fetched later.
9
 
10
  """
 
11
 
12
  if choice == 'Upload Document':
13
  uploaded_file = st.sidebar.file_uploader('Upload the File',
 
20
  st.session_state['filepath'] = temp.name
21
 
22
 
 
23
  else:
24
  # listing the options
25
  option = st.sidebar.selectbox('Select the example document',