leavoigt commited on
Commit
579b090
·
1 Parent(s): c7b02ac

Upload 11 files

Browse files
utils/adapmit_classifier.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
@st.cache_resource
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the Adaptation/Mitigation text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'adapmit' section of `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with return_all_scores=True and
    function_to_apply="sigmoid"), or None after a logged warning when neither
    argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('adapmit','MODEL')

    logging.info("Loading Adaptation Mitigation classifier")
    return pipeline("text-classification",
                    model=model_id,
                    return_all_scores=True,
                    function_to_apply= "sigmoid")
42
+
43
+
44
@st.cache_data
def adapmit_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Multi-label Adaptation/Mitigation tagging of every paragraph in the frame.

    Every text is scored by the classifier; each label whose score, rounded to
    two decimals, is >= threshold is kept for that paragraph.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column (one paragraph per row). It is
                modified in place: the column 'Adapt-Mitig Label' is added.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['adapmit_classifier'] is used.

    Returns
    ----------
    haystack_doc: the same DataFrame, where 'Adapt-Mitig Label' holds for each
                row the list of labels that passed the threshold (possibly
                empty), in the order the classifier reports the labels.
    """
    logging.info("Working on Adaptation-Mitigation Identification")
    haystack_doc['Adapt-Mitig Label'] = 'NA'

    if not classifier_model:
        classifier_model = st.session_state['adapmit_classifier']

    texts = list(haystack_doc.text)
    # do not hand an empty batch to the model
    predictions = classifier_model(texts) if texts else []

    # one row per paragraph, one column per label, cell = score
    scores = DataFrame([{entry['label']: entry['score'] for entry in per_text}
                        for per_text in predictions])
    passed = (scores.round(2).astype(float) >= threshold).to_numpy()
    categories = list(scores.columns)
    # list comprehension (not a set) keeps the label order deterministic
    haystack_doc['Adapt-Mitig Label'] = [
        [label for label, hit in zip(categories, row) if hit]
        for row in passed]

    return haystack_doc
utils/conditional_classifier.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+
12
@st.cache_resource
def load_conditionalClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the conditionality text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'conditional' section of
    `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with top_k=1), or None after a logged
    warning when neither argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('conditional','MODEL')

    logging.info("Loading conditional classifier")
    return pipeline("text-classification",
                    model=model_id,
                    top_k =1)
41
+
42
+
43
@st.cache_data
def conditional_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.8,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Conditionality tagging of the paragraphs that are either a 'TARGET' or
    carry at least one Policy-Action label. All other rows keep the defaults
    ('NA' label, 0.0 score).

    Params
    ---------
    haystack_doc: DataFrame with the columns 'text', 'Target Label' and
                'Policy-Action Label'. It is modified in place: the columns
                'Conditional Label', 'Conditional Score', 'cond_check' and
                'PA_check' are added to it.
    threshold: accepted for interface compatibility; the value is currently
                not applied to the scores.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['conditional_classifier'] is used.

    Returns
    ----------
    df: new DataFrame (non-selected rows first, then the classified rows)
        without the helper columns, indexed from 1.
    """
    logging.info("Working on Conditionality Identification")
    haystack_doc['Conditional Label'] = 'NA'
    haystack_doc['Conditional Score'] = 0.0
    haystack_doc['cond_check'] = False
    haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(
                                    lambda x: len(x) != 0)

    # vectorised form of the row-wise check; also safe on an empty frame
    haystack_doc['cond_check'] = ((haystack_doc['Target Label'] == 'TARGET')
                                  | (haystack_doc['PA_check'] == True))

    # the classifier is applied only to the selected paragraphs
    temp = haystack_doc[haystack_doc['cond_check'] == True]
    temp = temp.reset_index(drop=True)
    df = haystack_doc[haystack_doc['cond_check'] == False]
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['conditional_classifier']

    # with nothing selected, unpacking zip(*[]) used to raise ValueError
    if not temp.empty:
        results = classifier_model(list(temp.text))
        # each result is a list of predictions; the first entry is used
        temp['Conditional Label'] = [res[0]['label'] for res in results]
        temp['Conditional Score'] = [res[0]['score'] for res in results]

    # merging the classified rows back with the untouched ones
    df = pd.concat([df,temp])
    df = df.drop(columns = ['cond_check','PA_check'])
    df = df.reset_index(drop =True)
    df.index += 1

    return df
utils/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import configparser
2
+ import logging
3
+
4
def getconfig(configfile_path:str):
    """
    Reads a .cfg file into a ConfigParser object.

    Params
    ----------
    configfile_path: file path of .cfg file

    Return: the populated ConfigParser, or None (after a logged warning) when
    the file cannot be opened, decoded or parsed.
    """
    config = configparser.ConfigParser()

    try:
        # context manager so the file handle is always closed
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except (OSError, ValueError, configparser.Error) as err:
        logging.warning("config file not found or not readable: %s (%s)",
                        configfile_path, err)
        return None
    return config
16
+
17
+
18
+ # Declare all the necessary variables
19
def get_classifier_params(model_name):
    """
    Collects the parameters of one classifier from 'paramconfig.cfg'.

    Params
    ----------
    model_name: name of the section of the config file to read from

    Return: dictionary with the keys model_name, split_by, split_length,
    split_overlap, remove_punc, split_respect_sentence_boundary, threshold
    and top_n, converted to str/int/bool/float as done below.
    """
    config = getconfig('paramconfig.cfg')

    def option(key):
        # raw string value of `key` inside the section of this model
        return config.get(model_name, key)

    return {
        'model_name': option('MODEL'),
        'split_by': option('SPLIT_BY'),
        'split_length': int(option('SPLIT_LENGTH')),
        'split_overlap': int(option('SPLIT_OVERLAP')),
        # flags are stored as integers in the config file
        'remove_punc': bool(int(option('REMOVE_PUNC'))),
        'split_respect_sentence_boundary': bool(int(option('RESPECT_SENTENCE_BOUNDARY'))),
        'threshold': float(option('THRESHOLD')),
        'top_n': int(option('TOP_KEY')),
    }
utils/ghg_classifier.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+ # Labels dictionary ###
12
# maps the label returned by the GHG classifier to the label shown in the
# 'GHG Label' column
_lab_dict = dict(
    GHG='GHG',
    NOT_GHG='NON GHG TRANSPORT TARGET',
    NEGATIVE='OTHERS',
)
17
+
18
+
19
@st.cache_resource
def load_ghgClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the GHG text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'ghg' section of `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with top_k=1), or None after a logged
    warning when neither argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('ghg','MODEL')

    logging.info("Loading ghg classifier")
    return pipeline("text-classification",
                    model=model_id,
                    top_k =1)
48
+
49
+
50
@st.cache_data
def ghg_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    GHG tagging of the paragraphs whose 'Target Label' is 'TARGET'. Rows
    labelled 'NEGATIVE' keep the defaults ('NA' label, 0.0 score).

    Params
    ---------
    haystack_doc: DataFrame with the columns 'text' and 'Target Label'. It is
                modified in place: the columns 'GHG Label' and 'GHG Score' are
                added to it.
    threshold: accepted for interface compatibility; the value is currently
                not applied to the scores.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['ghg_classifier'] is used.

    Returns
    ----------
    df: new DataFrame ('NEGATIVE' rows first, then the classified 'TARGET'
        rows), 'GHG Score' rounded to two decimals, indexed from 1.
        NOTE(review): rows with any other 'Target Label' value are not part of
        the result - confirm that only these two values can occur.
    """
    logging.info("Working on GHG Extraction")
    haystack_doc['GHG Label'] = 'NA'
    haystack_doc['GHG Score'] = 0.0
    # applying GHG Identifier to only 'Target' paragraphs.
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET']
    temp = temp.reset_index(drop=True)
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']
    df = df.reset_index(drop=True)

    if not classifier_model:
        classifier_model = st.session_state['ghg_classifier']

    # with no 'TARGET' row, unpacking zip(*[]) used to raise ValueError
    if not temp.empty:
        results = classifier_model(list(temp.text))
        # each result is a list of predictions; the first entry is used and
        # its label is translated through _lab_dict
        temp['GHG Label'] = [_lab_dict[res[0]['label']] for res in results]
        temp['GHG Score'] = [res[0]['score'] for res in results]

    # merge back Target and non-Target dataframe
    df = pd.concat([df,temp])
    df = df.reset_index(drop =True)
    df['GHG Score'] = df['GHG Score'].round(2)
    df.index += 1

    return df
utils/indicator_classifier.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+
12
@st.cache_resource
def load_indicatorClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the indicator text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'indicator' section of
    `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with return_all_scores=True and
    function_to_apply="sigmoid"), or None after a logged warning when neither
    argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('indicator','MODEL')

    logging.info("Loading indicator classifier")
    # The model is multilabel, hence the plain transformers pipeline is used.
    # 'sigmoid' is requested explicitly so that the labels are scored
    # independently instead of through a softmax.
    return pipeline("text-classification",
                    model=model_id,
                    return_all_scores=True,
                    function_to_apply= "sigmoid")
51
+
52
+
53
@st.cache_data
def indicator_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Multi-label indicator tagging of the paragraphs which carry at least one
    Policy-Action label. All other rows keep the default 'NA'.

    Every selected text is scored by the classifier; each label whose score,
    rounded to two decimals, is >= threshold is kept for that paragraph.

    Params
    ---------
    haystack_doc: DataFrame with the columns 'text' and 'Policy-Action Label'.
                It is modified in place: the columns 'Indicator Label' and
                'PA_check' are added to it.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['indicator_classifier'] is used.

    Returns
    ----------
    df: new DataFrame (rows without Policy-Action label first, then the
        classified rows, original index values kept) without 'PA_check'.
    """
    logging.info("Working on Indicator Identification")
    haystack_doc['Indicator Label'] = 'NA'
    haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(
                                    lambda x: len(x) != 0)

    # explicit copy: the labels are written into this subset further down
    df1 = haystack_doc[haystack_doc['PA_check'] == True].copy()
    df = haystack_doc[haystack_doc['PA_check'] == False]
    if not classifier_model:
        classifier_model = st.session_state['indicator_classifier']

    # nothing to classify when no paragraph carries a Policy-Action label
    if not df1.empty:
        predictions = classifier_model(list(df1.text))

        # one row per paragraph, one column per label, cell = score
        scores = DataFrame([{entry['label']: entry['score'] for entry in per_text}
                            for per_text in predictions])
        passed = (scores.round(2).astype(float) >= threshold).to_numpy()
        categories = list(scores.columns)
        # list comprehension (not a set) keeps the label order deterministic
        df1['Indicator Label'] = [
            [label for label, hit in zip(categories, row) if hit]
            for row in passed]

    df = pd.concat([df,df1])
    df = df.drop(columns = ['PA_check'])
    return df
utils/netzero_classifier.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+ # Labels dictionary ###
12
# display names for the two labels 'NEGATIVE' and 'NETZERO'
_lab_dict = dict(
    NEGATIVE='NO NETZERO TARGET',
    NETZERO='NETZERO TARGET',
)
16
+
17
@st.cache_resource
def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the netzero text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'netzero' section of `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with top_k=1), or None after a logged
    warning when neither argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('netzero','MODEL')

    logging.info("Loading netzero classifier")
    return pipeline("text-classification",
                    model=model_id,
                    top_k =1)
46
+
47
+
48
@st.cache_data
def netzero_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.8,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Netzero tagging of the paragraphs whose 'Target Label' is 'TARGET'. Rows
    labelled 'NEGATIVE' keep the default 'NA' for label and score.

    Params
    ---------
    haystack_doc: DataFrame with the columns 'text' and 'Target Label'. It is
                modified in place: the columns 'Netzero Label' and
                'Netzero Score' are added to it.
    threshold: accepted for interface compatibility; the value is currently
                not applied to the scores.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['netzero_classifier'] is used.

    Returns
    ----------
    df: new DataFrame ('NEGATIVE' rows first, then the classified 'TARGET'
        rows), indexed from 1.
        NOTE(review): rows with any other 'Target Label' value are not part of
        the result - confirm that only these two values can occur.
    """
    logging.info("Working on Netzero Extraction")
    haystack_doc['Netzero Label'] = 'NA'
    haystack_doc['Netzero Score'] = 'NA'
    # explicit copy: the predictions are written into this subset further down
    temp = haystack_doc[haystack_doc['Target Label'] == 'TARGET'].copy()
    df = haystack_doc[haystack_doc['Target Label'] == 'NEGATIVE']

    if not classifier_model:
        classifier_model = st.session_state['netzero_classifier']

    # with no 'TARGET' row, unpacking zip(*[]) used to raise ValueError
    if not temp.empty:
        results = classifier_model(list(temp.text))
        # each result is a list of predictions; the first entry is used
        temp['Netzero Label'] = [res[0]['label'] for res in results]
        temp['Netzero Score'] = [res[0]['score'] for res in results]

    df = pd.concat([df,temp])
    df = df.reset_index(drop =True)
    df.index += 1

    return df
utils/policyaction_classifier.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+ ## Labels dictionary ###
12
# display names for the two labels 'NEGATIVE' and 'TARGET'
_lab_dict = dict(
    NEGATIVE='NO TARGET INFO',
    TARGET='TARGET',
)
16
+
17
@st.cache_resource
def load_policyactionClassifier(config_file:str = None, classifier_name:str = None):
    """
    Builds the Policy/Action text-classification pipeline.

    The model identifier is taken from `classifier_name` when given, otherwise
    it is read from the MODEL option of the 'policyaction' section of
    `config_file`.

    Params
    --------
    config_file: config file path from which to read the model name; only
                 consulted when classifier_name is empty.
    classifier_name: model name/path handed to the transformers pipeline; takes
                 priority over config_file.

    Return: the pipeline object (created with return_all_scores=True and
    function_to_apply="sigmoid"), or None after a logged warning when neither
    argument is supplied.
    """
    model_id = classifier_name
    if not model_id and not config_file:
        logging.warning("Pass either model name or config file")
        return None
    if not model_id:
        model_id = getconfig(config_file).get('policyaction','MODEL')

    logging.info("Loading classifier")

    return pipeline("text-classification",
                    model=model_id,
                    return_all_scores=True,
                    function_to_apply= "sigmoid")
48
+
49
+
50
@st.cache_data
def policyaction_classification(haystack_doc:pd.DataFrame,
                        threshold:float = 0.5,
                        classifier_model:pipeline= None
                        )->DataFrame:
    """
    Multi-label Policy/Action tagging of every paragraph in the frame.

    Every text is scored by the classifier; each label whose score, rounded to
    two decimals, is >= threshold is kept for that paragraph.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column (one paragraph per row). It is
                modified in place: the column 'Policy-Action Label' is added.
    threshold: minimum (rounded) score for a label to be kept.
    classifier_model: classifier to call with the list of texts; when not
                passed, st.session_state['policyaction_classifier'] is used.

    Returns
    ----------
    haystack_doc: the same DataFrame, where 'Policy-Action Label' holds for
                each row the list of labels that passed the threshold (possibly
                empty), in the order the classifier reports the labels.
    """
    logging.info("Working on Policy/Action. Extraction")
    haystack_doc['Policy-Action Label'] = 'NA'
    if not classifier_model:
        classifier_model = st.session_state['policyaction_classifier']

    texts = list(haystack_doc.text)
    # do not hand an empty batch to the model
    predictions = classifier_model(texts) if texts else []

    # one row per paragraph, one column per label, cell = score
    scores = DataFrame([{entry['label']: entry['score'] for entry in per_text}
                        for per_text in predictions])
    passed = (scores.round(2).astype(float) >= threshold).to_numpy()
    categories = list(scores.columns)
    # list comprehension (not a set) keeps the label order deterministic
    haystack_doc['Policy-Action Label'] = [
        [label for label, hit in zip(categories, row) if hit]
        for row in passed]

    return haystack_doc
utils/preprocessing (1).py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes.base import BaseComponent
2
+ from haystack.schema import Document
3
+ from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
4
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
5
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
6
+ from typing_extensions import Literal
7
+ import pandas as pd
8
+ import logging
9
+ import re
10
+ import string
11
+ from haystack.pipelines import Pipeline
12
+
13
def useOCR(file_path: str)-> Text:
    """
    Runs the OCR converter on a pdf and hands back the extracted text.

    Params
    ----------
    file_path: file_path of the uploaded file

    Returns the content of the first converted document as string.
    """
    ocr_converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                          valid_languages=["eng"])
    converted = ocr_converter.convert(file_path=file_path, meta=None)
    first_document = converted[0]
    return first_document.content
30
+
31
+
32
+
33
+
34
class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the
    appropriate Converter class; falls back to OCR (see useOCR) when the
    extracted text of the file is empty apart from page separators.
    The FileClassifier from haystack is not used, as it does not have any
    label/output class for image.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of file; its extension (.pdf/.txt/.docx, compared
                   case-insensitively) selects the converter
        file_path: file_path of the uploaded file
        encoding: passed through to the converter
        id_hash_keys: passed through to the converter and to the Document

        Return
        ---------
        output: dictionary with the key 'documents' holding a list with the
                single converted Document

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        Raises
        ---------
        ValueError: if the extension of file_name is not supported.
        """
        lowered_name = file_name.lower()
        if not lowered_name.endswith(('.pdf', '.txt', '.docx')):
            # previously this case ended in an UnboundLocalError further down
            logging.error("unsupported file type: %s", file_name)
            raise ValueError(
                "unsupported file type: {}".format(file_name))

        try:
            if lowered_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            elif lowered_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            else:
                converter = DocxToTextConverter()
        except Exception as e:
            # best effort kept from the original: log and hand back nothing
            logging.error(e)
            return

        # encoding is empty, probably should be utf-8
        document = converter.convert(
                      file_path=file_path, meta=None,
                      encoding=encoding, id_hash_keys=id_hash_keys
                      )[0]

        text = document.content

        # in case of scanned/images only PDF the content might contain only
        # the page separator (\f or \x0c). We check if is so and
        # use the OCR to get the text.
        filtered = re.sub(r'\x0c', '', text)

        if filtered == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self, *args, **kwargs):
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """

        return
121
+
122
+
123
def basic(s:str, remove_punc:bool = False):
    """
    Light-weight text cleaning.

    Params
    ----------
    s: string to be processed
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Returns: the cleaned string, stripped of leading/trailing whitespace.
    """
    # URLs: first whole lines starting with a link, then any remaining link
    cleaned = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    cleaned = re.sub(r"http\S+", " ", cleaned)

    # new line characters become blanks
    cleaned = cleaned.replace('\n', ' ')

    # punctuation is dropped only on request
    if remove_punc == True:
        cleaned = cleaned.translate(
            str.maketrans(' ', ' ', string.punctuation))

    # single quotes become blanks, pairs of dots are removed
    cleaned = cleaned.replace("'", " ").replace("..", "")

    return cleaned.strip()
152
+
153
def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where preprocessor cannot respect word limit, when using
    respect sentence boundary flag due to missing sentence boundaries.
    Therefore we run one more round of split here for those paragraphs.

    Params
    ---------------
    paraList : list of paragraphs; each item exposes `.content` (text) and
               `.meta['page']`
    max_len : max number of whitespace separated words per paragraph

    Returns
    ---------------
    list of (text, page) tuples. Paragraphs within the limit are passed on
    unchanged; longer ones are cut into consecutive pieces of at most max_len
    words (joined by single blanks), each piece keeping the page of its source.

    Raises
    ---------------
    ValueError: if max_len is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    new_para_list = []
    for passage in paraList:
        words = passage.content.split()   # split once, reused for all pieces
        page = passage.meta['page']
        if len(words) > max_len:
            # stepping by max_len never yields an empty trailing piece, which
            # the former remainder handling produced for exact multiples
            for start in range(0, len(words), max_len):
                piece = " ".join(words[start:start + max_len])
                new_para_list.append((piece, page))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, page))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list
184
+
185
class UdfPreProcessor(BaseComponent):
    """
    Pipeline node that preprocesses the documents returned by FileConverter.
    It splits each document by word or by sentence (depending on the chosen
    strategy), synthetically creates the paragraphs and cleans their text.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor

    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):

        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by Fileconverter
        remove_punc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragraphs from document,
            it defines the length of paragraph.
        split_respect_sentence_boundary: only used with the 'word' strategy;
            it is forced to False when splitting by sentence.
        split_overlap: Number of words or sentences that overlap when creating
            the paragraphs. This is done as one sentence or 'some words' make
            sense when read in together with others.

        Return
        ---------
        output: dictionary with four entries
            'documents': list of the processed paragraph documents of ALL
                         input documents, in input order
            'dataframe': pandas DataFrame built from that list
            'text':      all paragraph texts joined by a single space
            'paraList':  list of the paragraph texts

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """

        # sentence boundaries are only meaningful for the 'word' strategy
        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True
        )

        # Collect the paragraphs of every document; re-binding the result
        # list per document would keep only the last document and would
        # leave it undefined for an empty input.
        docs_processed = []
        for document in documents:
            for item in preprocessor.process([document]):
                item.content = basic(item.content, remove_punc=remove_punc)
                docs_processed.append(item)

        df = pd.DataFrame(docs_processed)
        # an empty frame has no 'content' column to read from
        para_list = df.content.to_list() if docs_processed else []
        all_text = " ".join(para_list)
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch():
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.

        Note: it takes no parameters (not even self), so it is a pure
        placeholder and is not meant to be invoked on an instance.
        """
        return
273
+
274
def processingpipeline():
    """
    Assemble the two-stage preprocessing pipeline.

    The "File" input feeds a FileConverter node, whose output is handed to a
    UdfPreProcessor node.

    Returns: the configured Pipeline object
    """
    pipe = Pipeline()

    # (node name, component instance, names of the nodes feeding it)
    stages = (
        ("FileConverter", FileConverter(), ["File"]),
        ('UdfPreProcessor', UdfPreProcessor(), ["FileConverter"]),
    )
    for node_name, node, node_inputs in stages:
        pipe.add_node(component=node, name=node_name, inputs=node_inputs)

    return pipe
291
+
utils/sector_classifier.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
+
12
@st.cache_resource
def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the sector classifier as a transformers text-classification
    pipeline. The model is identified by its name/path string; pass either
    the model name or a config file.

    Params
    --------
    config_file: config file path; the model name is read from option
        'MODEL' of section 'sector'. Only consulted when no
        classifier_name is given.
    classifier_name: model name/path, takes priority over the config file.

    Return: the classifier pipeline, or None (after logging a warning) when
        neither argument is provided.
    """
    if not (classifier_name or config_file):
        logging.warning("Pass either model name or config file")
        return

    if not classifier_name:
        classifier_name = getconfig(config_file).get('sector','MODEL')

    logging.info("Loading sector classifier")
    # The model is multilabel, so 'sigmoid' is requested explicitly;
    # otherwise the pipeline would apply softmax, which is not desired here.
    return pipeline("text-classification",
                    model=classifier_name,
                    return_all_scores=True,
                    function_to_apply= "sigmoid")
51
+
52
+
53
@st.cache_data
def sector_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->DataFrame:
    """
    Multilabel sector classification of the paragraphs in a DataFrame.

    Every text is scored against all labels; each label whose score
    (rounded to two decimals) reaches the threshold is attached to the
    paragraph.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column holding the paragraphs.
        A 'Sector Label' column is written onto this same DataFrame.
    threshold: minimum rounded score for a label to be kept
    classifier_model: callable taking a list of texts and returning, per
        text, a list of {'label': ..., 'score': ...} dicts. When not passed,
        the model stored in st.session_state['sector_classifier'] is used.

    Returns
    ----------
    haystack_doc: the input DataFrame with the 'Sector Label' column, each
        cell being the list of labels kept for that paragraph (possibly
        empty), ordered by label column.
    """
    logging.info("Working on Sector Identification")
    if not classifier_model:
        classifier_model = st.session_state['sector_classifier']

    predictions = classifier_model(list(haystack_doc.text))

    # one row per paragraph, one column per label
    scores = DataFrame([{entry['label']: entry['score'] for entry in paragraph}
                        for paragraph in predictions])
    # scores are rounded before thresholding, e.g. 0.496 counts as 0.5
    passed = scores.round(2).astype(float) >= threshold
    categories = list(passed.columns)

    # Walk the labels in column order so the list of labels per paragraph is
    # deterministic (collecting them through a set made the order arbitrary).
    haystack_doc['Sector Label'] = [
        [label for label, hit in zip(categories, row) if hit]
        for row in passed.to_numpy()
    ]
    return haystack_doc
utils/target_classifier.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+ from typing_extensions import Literal
3
+ import logging
4
+ import pandas as pd
5
+ from pandas import DataFrame, Series
6
+ from utils.config import getconfig
7
+ from utils.preprocessing import processingpipeline
8
+ import streamlit as st
9
+ from transformers import pipeline
10
+
11
# Maps each raw classifier label to the description stored in 'Label_def'.
_lab_dict = dict(
    NEGATIVE='NO TARGET INFO',
    TARGET='TARGET',
)
16
+
17
@st.cache_resource
def load_targetClassifier(config_file:str = None, classifier_name:str = None):
    """
    Build the target classifier as a transformers text-classification
    pipeline that returns only the top label per text. The model is
    identified by its name/path string; pass either the model name or a
    config file.

    Params
    --------
    config_file: config file path; the model name is read from option
        'MODEL' of section 'target'. Only consulted when no
        classifier_name is given.
    classifier_name: model name/path, takes priority over the config file.

    Return: the classifier pipeline, or None (after logging a warning) when
        neither argument is provided.
    """
    if not (classifier_name or config_file):
        logging.warning("Pass either model name or config file")
        return

    if not classifier_name:
        classifier_name = getconfig(config_file).get('target','MODEL')

    logging.info("Loading classifier")
    return pipeline("text-classification",
                    model=classifier_name,
                    top_k =1)
47
+
48
+
49
@st.cache_data
def target_classification(haystack_doc:pd.DataFrame,
                          threshold:float = 0.5,
                          classifier_model:pipeline= None
                          )->DataFrame:
    """
    Classify every paragraph as containing target information or not.

    Params
    ---------
    haystack_doc: DataFrame with a 'text' column holding the paragraphs.
    threshold: accepted for signature compatibility; it is currently not
        used by this function.
    classifier_model: callable taking a list of texts and returning, per
        text, a list whose first element is a {'label': ..., 'score': ...}
        dict. When not passed, the model stored in
        st.session_state['target_classifier'] is used.

    Returns
    ----------
    df: new DataFrame holding the input columns plus
        'Target Label' (top label), 'Relevancy' (its score) and 'Label_def'
        (label description looked up in _lab_dict), sorted by 'Relevancy'
        in descending order and re-indexed starting at 1.
    """
    logging.info("Working on Target Extraction")
    if not classifier_model:
        classifier_model = st.session_state['target_classifier']

    results = classifier_model(list(haystack_doc.text))
    labels_ = [(top[0]['label'], top[0]['score']) for top in results]

    # Reuse the input's index: concat(axis=1) aligns rows on index, so a
    # fresh 0..n-1 index would misalign with a filtered/re-indexed input.
    df1 = DataFrame(labels_, columns=["Target Label","Relevancy"],
                    index=haystack_doc.index)
    df = pd.concat([haystack_doc,df1],axis=1)

    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    # 1-based index after sorting
    df.index += 1
    df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])

    return df
utils/uploadAndExample (1).py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tempfile
3
+ import json
4
+
5
def add_upload(choice):
    """
    Offers the user either a document upload or a choice of example
    documents, and stores the resulting file name and file path in
    streamlit session_state ('filename' / 'filepath') for later use.

    choice: 'Upload Document' shows the file uploader; any other value
        shows the example-document selector.
    """
    if choice != 'Upload Document':
        # example documents are listed in a json file
        with open('docStore/sample/files.json','r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                      list(files.keys()))
        # the stored value serves as both the name and the path
        selected = files[option]
        st.session_state['filename'] = selected
        st.session_state['filepath'] = selected
        return

    uploaded_file = st.sidebar.file_uploader('Upload the File',
                                             type=['pdf', 'docx', 'txt'])
    if uploaded_file is None:
        return

    # delete=False keeps the temporary copy on disk after the with-block so
    # that its path stays usable.
    with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
        temp.write(uploaded_file.getvalue())
        st.session_state['filename'] = uploaded_file.name
        st.session_state['filepath'] = temp.name