leavoigt committed

Commit cfcd3f8
Parent(s): f937ba1

Upload 9 files
utils/checkconfig.py ADDED
@@ -0,0 +1,15 @@
+ import configparser
+ import logging
+
+ def getconfig(configfile_path: str):
+     """
+     configfile_path: file path of the .cfg file
+     """
+
+     config = configparser.ConfigParser()
+
+     try:
+         config.read_file(open(configfile_path))
+         return config
+     except FileNotFoundError:
+         logging.warning("config file not found")
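
A minimal usage sketch (not part of the commit). The config file name is an assumption; the 'sdg'/'MODEL' lookup mirrors the one used later in utils/sdg_classifier.py:

    from utils.checkconfig import getconfig

    config = getconfig("paramconfig.cfg")          # hypothetical config file name
    if config is not None:
        classifier_name = config.get("sdg", "MODEL")   # same lookup as load_sdgClassifier
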
utils/keyword_extraction.py ADDED
@@ -0,0 +1,140 @@
+ import pandas as pd
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+ # import nltk
+ # nltk.download('stopwords')
+ # from nltk.corpus import stopwords
+ import pickle
+ from typing import List, Text
+ import logging
+ from summa import keywords
+
+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
+
+
+ def sort_coo(coo_matrix):
+     """
+     Takes a scipy sparse matrix in COOrdinate format and returns its
+     (column, value) pairs sorted by value.
+     1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
+     """
+     tuples = zip(coo_matrix.col, coo_matrix.data)
+     return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
+
+ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
+     """get the feature names and tf-idf scores of the top n items
+
+     Params
+     ---------
+     feature_names: list of words from vectorizer
+     sorted_items: tuples returned by the sort_coo function defined in
+         keyword_extraction.py
+     top_n: top n words to be extracted using tf-idf
+
+     Return
+     ----------
+     results: top extracted keywords
+
+     """
+
+     # use only top_n items from vector
+     sorted_items = sorted_items[:top_n]
+     score_vals = []
+     feature_vals = []
+
+     # word index and corresponding tf-idf score
+     for idx, score in sorted_items:
+
+         # keep track of feature name and its corresponding score
+         score_vals.append(round(score, 3))
+         feature_vals.append(feature_names[idx])
+
+     results = {}
+     for idx in range(len(feature_vals)):
+         results[feature_vals[idx]] = score_vals[idx]
+
+     return results
+
+
+ def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
+     """
+     TF-IDF based keyword extraction
+
+     Params
+     ---------
+     vectorizer: trained count vectorizer model
+     tfidfmodel: TF-IDF Transformer model
+     top_n: top n keywords to be extracted
+     textdata: text data on which keyword extraction is performed
+
+     Return
+     ----------
+     keywords: top extracted keywords
+
+     """
+     features = vectorizer.get_feature_names_out()
+     tf_idf_vector = tfidfmodel.transform(vectorizer.transform(textdata))
+     sorted_items = sort_coo(tf_idf_vector.tocoo())
+     results = extract_topn_from_vector(features, sorted_items, top_n)
+     keywords = [keyword for keyword in results]
+     return keywords
+
+ def keyword_extraction(sdg:int, sdgdata:List[Text], top_n:int = 10):
+     """
+     TF-IDF based keyword extraction
+
+     Params
+     ---------
+     sdg: which SDG tf-idf model is to be used
+     sdgdata: text data on which keyword extraction is performed
+
+     Return
+     ----------
+     keywords: top extracted keywords
+
+     """
+     model_path = "docStore/sdg{}/".format(sdg)
+     with open(model_path + 'vectorizer.pkl', 'rb') as file:
+         vectorizer = pickle.load(file)
+     with open(model_path + 'tfidfmodel.pkl', 'rb') as file:
+         tfidfmodel = pickle.load(file)
+     features = vectorizer.get_feature_names_out()
+     tf_idf_vector = tfidfmodel.transform(vectorizer.transform(sdgdata))
+     sorted_items = sort_coo(tf_idf_vector.tocoo())
+     results = extract_topn_from_vector(features, sorted_items, top_n)
+     keywords = [keyword for keyword in results]
+     return keywords
+
+ @st.cache(allow_output_mutation=True)
+ def textrank(textdata:Text, ratio:float = 0.1, words:int = 0) -> List[str]:
+     """
+     wrapper function to perform TextRank; uses either ratio or word count to
+     limit the number of extracted keywords.
+     1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
+
+     Params
+     --------
+     textdata: text data on which TextRank is performed.
+     ratio: float to limit the number of keywords as a proportion of total
+         tokens in textdata
+     words: number of keywords to be extracted. Takes priority over ratio if
+         non-zero. However, in case PageRank returns fewer keywords than the
+         fixed value, ratio is used instead.
+
+     Return
+     --------
+     results: extracted keywords
+     """
+     if words == 0:
+         logging.info("Textrank using default ratio value = 0.1, as no words limit given")
+         results = keywords.keywords(textdata, ratio=ratio).split("\n")
+     else:
+         try:
+             results = keywords.keywords(textdata, words=words).split("\n")
+         except Exception:
+             results = keywords.keywords(textdata, ratio=ratio).split("\n")
+
+     return results
+
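
A hedged usage sketch (not part of the commit). It assumes the pickled vectorizer/tfidfmodel exist under docStore/sdg<N>/ as keyword_extraction expects; the sample text is invented:

    from utils.keyword_extraction import keyword_extraction, textrank

    paragraphs = ["Access to clean water and sanitation remains limited in rural areas."]
    # TF-IDF keywords for SDG 6, assuming docStore/sdg6/vectorizer.pkl and tfidfmodel.pkl exist
    tfidf_keywords = keyword_extraction(sdg=6, sdgdata=paragraphs, top_n=5)
    # graph-based keywords via summa's TextRank (the st.cache decorator assumes Streamlit is installed)
    textrank_keywords = textrank(" ".join(paragraphs), words=5)
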
utils/lexical_search.py ADDED
@@ -0,0 +1,251 @@
+ from haystack.nodes import TfidfRetriever
+ from haystack.document_stores import InMemoryDocumentStore
+ import spacy
+ import re
+ from spacy.matcher import Matcher
+ from markdown import markdown
+ from annotated_text import annotation
+ from haystack.schema import Document
+ from typing import List, Text, Tuple
+ from typing_extensions import Literal
+ from utils.preprocessing import processingpipeline
+ from utils.streamlitcheck import check_streamlit
+ import logging
+ try:
+     from termcolor import colored
+ except ImportError:
+     pass
+
+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
+
+
+ def runLexicalPreprocessingPipeline(file_name:str, file_path:str,
+                         split_by: Literal["sentence", "word"] = 'word',
+                         split_length:int = 80, split_overlap:int = 0,
+                         remove_punc:bool = False) -> List[Document]:
+     """
+     Creates and runs the preprocessing pipeline; the params for the pipeline
+     are fetched from paramconfig. As lexical search is not affected by overlap,
+     the default paramconfig uses split_overlap = 0 and split_by = word.
+
+     Params
+     ------------
+     file_name: filename, in case of streamlit application use
+         st.session_state['filename']
+     file_path: filepath, in case of streamlit application use
+         st.session_state['filepath']
+     split_by: document splitting strategy, either word or sentence
+     split_length: when synthetically creating the paragraphs from the document,
+         it defines the length of a paragraph.
+     split_overlap: Number of words or sentences that overlap when creating
+         the paragraphs. This is done as one sentence or 'some words' make sense
+         when read together with others. Therefore the overlap is used.
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Return
+     --------------
+     List[Document]: When the preprocessing pipeline is run, the output dictionary
+         has four objects. For the lexical search using TfidfRetriever we
+         need to use the List of Haystack Document, which can be fetched by
+         key = 'documents' on the output.
+
+     """
+
+     lexical_processing_pipeline = processingpipeline()
+
+     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
+                             params= {"FileConverter": {"file_path": file_path,
+                                                        "file_name": file_name},
+                                      "UdfPreProcessor": {"remove_punc": remove_punc,
+                                                          "split_by": split_by,
+                                                          "split_length": split_length,
+                                                          "split_overlap": split_overlap}})
+
+     return output_lexical_pre
+
+
+ def tokenize_lexical_query(query:str) -> List[str]:
+     """
+     Removes the stop words from the query and returns the list of important
+     keywords in it. For the lexical search the relevant paragraphs in the document
+     are retrieved using TfidfRetriever from Haystack. However, to highlight these
+     keywords we need the tokenized form of the query.
+
+     Params
+     --------
+     query: string which represents either a list of keywords the user is looking
+         for, or a query in the form of a question.
+
+     Return
+     -----------
+     token_list: list of important keywords in the query.
+
+     """
+     nlp = spacy.load("en_core_web_sm")
+     token_list = [token.text.lower() for token in nlp(query)
+                   if not (token.is_stop or token.is_punct)]
+     return token_list
+
+ def runSpacyMatcher(token_list:List[str], document:Text
+                     ) -> Tuple[List[List[int]], spacy.tokens.doc.Doc]:
+     """
+     Uses spaCy in the backend to find the keywords in the document via the
+     Matcher class. We could alternatively use regex, but spaCy finds all keywords
+     in a serialized manner, which helps in annotation of answers.
+
+     Params
+     -------
+     token_list: the token list returned by the tokenize_lexical_query function
+     document: text in which we need to find the tokens
+
+     Return
+     --------
+     matches: List of [start_index, end_index] in the spacydoc (at word level, not
+         character level) for the keywords in the token list.
+
+     spacydoc: the keyword indices in the spacydoc are at word level and not character
+         level, therefore to allow the annotator to work seamlessly we return the spacydoc.
+
+     """
+     nlp = spacy.load("en_core_web_sm")
+     spacydoc = nlp(document)
+     matcher = Matcher(nlp.vocab)
+     token_pattern = [[{"LOWER": token}] for token in token_list]
+     matcher.add(",".join(token_list), token_pattern)
+     spacymatches = matcher(spacydoc)
+
+     # getting start and end index in spacydoc so that annotator can work seamlessly
+     matches = []
+     for match_id, start, end in spacymatches:
+         matches = matches + [[start, end]]
+
+     return matches, spacydoc
+
+ def runRegexMatcher(token_list:List[str], document:Text):
+     """
+     Uses regex in the backend to find the keywords in the document.
+
+     Params
+     -------
+     token_list: the token list returned by the tokenize_lexical_query function
+     document: text in which we need to find the tokens
+
+     Return
+     --------
+     matches: List of [start_index, end_index] in the document for the keywords
+         in the token list, at character level.
+
+     document: the keyword indices returned by regex are at character level,
+         therefore to allow the annotator to work seamlessly we return the text back.
+
+     """
+     matches = []
+     for token in token_list:
+         matches = (matches +
+                    [[val.start(), val.start() + len(token)]
+                     for val in re.finditer(token, document)])
+
+     return matches, document
+
+ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
+     """
+     This is the spaCy annotator and needs a spacy.tokens.Doc.
+     Annotates the text in the document defined by a list of [start index, end index].
+     Example: "How are you today"; if the document type is text, matches = [[0,3]]
+     will give answer = "How", however in case we used the spaCy matcher then
+     matches = [[0,3]] will give answer = "How are you". If spaCy is used
+     to find "How" then matches = [[0,1]] for the string defined above.
+
+     Params
+     -----------
+     matches: list of lists, e.g. [[0,1],[10,13]]
+     document: document which needs to be annotated.
+
+     Return
+     --------
+     Sends the output either to the app front end using streamlit or
+     writes it directly to the output screen.
+
+     """
+     start = 0
+     annotated_text = ""
+     for match in matches:
+         start_idx = match[0]
+         end_idx = match[1]
+
+         if check_streamlit():
+             annotated_text = (annotated_text + document[start:start_idx].text
+                               + str(annotation(body=document[start_idx:end_idx].text,
+                                     label="ANSWER", background="#964448", color='#ffffff')))
+         else:
+             annotated_text = (annotated_text + document[start:start_idx].text
+                               + colored(document[start_idx:end_idx].text,
+                                         "green", attrs=['bold']))
+
+         start = end_idx
+
+     # append the remainder of the document after the last match
+     annotated_text = annotated_text + document[start:].text
+
+     if check_streamlit():
+         st.write(
+             markdown(annotated_text),
+             unsafe_allow_html=True,
+         )
+     else:
+         print(annotated_text)
+
+ def lexical_search(query:Text, documents:List[Document], top_k:int):
+     """
+     Performs the lexical search on the list of Haystack documents which is
+     returned by the preprocessing pipeline.
+
+     Params
+     -------
+     query: Keywords that need to be searched in the documents.
+     documents: List of Haystack documents returned by the preprocessing pipeline.
+     top_k: Number of top results to be fetched.
+
+     """
+
+     document_store = InMemoryDocumentStore()
+     document_store.write_documents(documents)
+
+     # Haystack Retriever works with document stores only.
+     retriever = TfidfRetriever(document_store)
+     results = retriever.retrieve(query=query, top_k=top_k)
+     query_tokens = tokenize_lexical_query(query)
+     flag = True
+     for count, result in enumerate(results):
+         matches, doc = runSpacyMatcher(query_tokens, result.content)
+
+         if len(matches) != 0:
+             if flag:
+                 flag = False
+                 if check_streamlit():
+                     st.markdown("##### Top few lexical search (TFIDF) hits #####")
+                 else:
+                     print("Top few lexical search (TFIDF) hits")
+
+             if check_streamlit():
+                 st.write("Result {}".format(count + 1))
+             else:
+                 print("Result {}".format(count + 1))
+             spacyAnnotator(matches, doc)
+
+     if flag:
+         if check_streamlit():
+             st.info("🤔 No relevant result found. Please try another keyword.")
+         else:
+             print("No relevant result found. Please try another keyword.")
utils/ndc_explorer.py ADDED
@@ -0,0 +1,90 @@
+
+ import urllib.request
+ import json
+
+ link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
+ def get_document(country_code: str):
+     """
+     read the country NDC data from
+     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
+     using the country code.
+
+     Params
+     -------
+     country_code: standard country code used to fetch the country-specific data
+     """
+     with urllib.request.urlopen(link) as urlfile:
+         data = json.loads(urlfile.read())
+     categoriesData = {}
+     categoriesData['categories'] = data['categories']
+     categoriesData['subcategories'] = data['subcategories']
+     keys_sub = categoriesData['subcategories'].keys()
+     documentType = 'NDCs'
+     if documentType in data.keys():
+         if country_code in data[documentType].keys():
+             get_dict = {}
+             for key, value in data[documentType][country_code].items():
+                 if key not in ['country_name', 'region_id', 'region_name']:
+                     get_dict[key] = value['classification']
+                 else:
+                     get_dict[key] = value
+         else:
+             return None
+     else:
+         return None
+
+     country = {}
+     for key in categoriesData['categories']:
+         country[key] = {}
+     for key, value in categoriesData['subcategories'].items():
+         country[value['category']][key] = get_dict[key]
+
+     return country
+
+
+ def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
+     """
+     Based on the country code, reads the country data from
+     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
+     using get_document from utils.ndc_explorer.py, then based on the threshold
+     value filters the Climate Change Adaptation targets assigned by the NDC
+     Explorer team to that country. Using the sentences created by the Data
+     Services team of GIZ for each target level, tries to find the relevant
+     passages from the document by doing the semantic search.
+
+     Params
+     -------
+     cca_sent: dictionary with 'target labels' as keys and manufactured sentences
+         reflecting the target level as values. Please see docStore/ndcs/cca.txt
+
+     threshold: NDC targets have several categories ranging from [0-5], with 0
+         reflecting the most relaxed attitude and 5 being the most aggressive
+         towards climate change. We select the threshold value beyond which we
+         need to focus.
+
+     countryCode: standard country code to allow us to fetch the country-specific
+         data.
+
+     """
+     temp = {}
+     doc = get_document(countryCode)
+     for key, value in cca_sent.items():
+         id_ = doc['climate change adaptation'][key]['id']
+         if id_ > threshold:
+             temp[key] = value['id'][id_]
+     return temp
+
+
+ def countrySpecificCCM(ccm_sent, threshold, countryCode):
+     """
+     See the documentation of countrySpecificCCA. This is the same, except it
+     fetches the data pertaining to climate change Mitigation instead of Adaptation.
+
+     """
+
+     temp = {}
+     doc = get_document(countryCode)
+     for key, value in ccm_sent.items():
+         id_ = doc['climate change mitigation'][key]['id']
+         if id_ > threshold:
+             temp[key] = value['id'][id_]
+
+     return temp
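
A sketch of how these helpers might be called (not part of the commit). The country code is a placeholder and cca_sentences is assumed to be the target-label/sentence dictionary described above (e.g. loaded from docStore/ndcs/cca.txt):

    from utils.ndc_explorer import get_document, countrySpecificCCA

    country_data = get_document("KEN")          # hypothetical country code
    # cca_sentences: {target_label: {'id': {level: sentence}}}, per the docstring above
    selected = countrySpecificCCA(cca_sent=cca_sentences, threshold=2, countryCode="KEN")
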
utils/preprocessing.py ADDED
@@ -0,0 +1,260 @@
+ from haystack.nodes.base import BaseComponent
+ from haystack.schema import Document
+ from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
+ from typing_extensions import Literal
+ import pandas as pd
+ import logging
+ import re
+ import string
+ from haystack.pipelines import Pipeline
+
+ def useOCR(file_path: str) -> Text:
+     """
+     Converts image pdfs into text, using farm-haystack[OCR]
+
+     Params
+     ----------
+     file_path: file path of the uploaded file, returned by the add_upload function
+         in uploadAndExample.py
+
+     Returns the text of the file as a string.
+     """
+
+     converter = PDFToTextOCRConverter(remove_numeric_tables=True,
+                                       valid_languages=["eng"])
+     docs = converter.convert(file_path=file_path, meta=None)
+     return docs[0].content
+
+
+ class FileConverter(BaseComponent):
+     """
+     Wrapper class to convert an uploaded document into text by calling the
+     appropriate Converter class; will internally use haystack PDFToTextOCR in
+     case of an image pdf. Cannot use the FileClassifier from haystack as it
+     doesn't have any label/output class for images.
+
+     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
+     2. https://docs.haystack.deepset.ai/docs/file_converters
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
+     4. https://docs.haystack.deepset.ai/reference/file-converters-api
+
+     """
+
+     outgoing_edges = 1
+
+     def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             ) -> Tuple[dict, str]:
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         file_name: name of file
+         file_path: file path of the uploaded file, returned by the add_upload
+             function in uploadAndExample.py
+
+         See the links provided in the class docstring for the other params.
+
+         Return
+         ---------
+         output: dictionary, with key as identifier and value could be anything
+             we need to return. In this case it is the List of Haystack Documents.
+
+         output_1: As there is only one outgoing edge, we pass the 'output_1' string
+         """
+         try:
+             if file_name.endswith('.pdf'):
+                 converter = PDFToTextConverter(remove_numeric_tables=True)
+             elif file_name.endswith('.txt'):
+                 converter = TextConverter(remove_numeric_tables=True)
+             elif file_name.endswith('.docx'):
+                 converter = DocxToTextConverter()
+             else:
+                 raise ValueError("unsupported file type: {}".format(file_name))
+         except Exception as e:
+             logging.error(e)
+             return
+
+         documents = []
+
+         document = converter.convert(
+             file_path=file_path, meta=None,
+             encoding=encoding, id_hash_keys=id_hash_keys
+         )[0]
+
+         text = document.content
+
+         # if file is an image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"};
+         # substitute this substring with '' and check if content is an empty string
+
+         text = re.sub(r'\x0c', '', text)
+         documents.append(Document(content=text,
+                                   meta={"name": file_name},
+                                   id_hash_keys=id_hash_keys))
+
+         # check if text is empty and apply the pdf OCR converter.
+         for i in documents:
+             if i.content == "":
+                 logging.info("Using OCR")
+                 i.content = useOCR(file_path)
+
+         logging.info('file conversion successful')
+         output = {'documents': documents}
+         return output, 'output_1'
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing here; however, to use the custom node we need to have
+         this method for the class.
+         """
+         return
+
+
+ def basic(s:str, remove_punc:bool = False):
+
+     """
+     Performs basic cleaning of text.
+
+     Params
+     ----------
+     s: string to be processed
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Returns the processed string; see comments in the source code for more info.
+     """
+
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+
+     # Remove new line characters
+     s = re.sub('\n', ' ', s)
+
+     # Remove punctuation
+     if remove_punc:
+         translator = str.maketrans(' ', ' ', string.punctuation)
+         s = s.translate(translator)
+     # Remove distracting single quotes and dotted pattern
+     s = re.sub("\'", " ", s)
+     s = s.replace("..", "")
+
+     return s.strip()
+
+
+ class UdfPreProcessor(BaseComponent):
+     """
+     Class to preprocess the document returned by FileConverter. It will check
+     the splitting strategy, split the document by word or sentence and then
+     synthetically create the paragraphs.
+
+     1. https://docs.haystack.deepset.ai/docs/preprocessor
+     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
+
+     """
+     outgoing_edges = 1
+
+     def run(self, documents:List[Document], remove_punc:bool = False,
+             split_by: Literal["sentence", "word"] = 'sentence',
+             split_length:int = 2, split_respect_sentence_boundary:bool = False,
+             split_overlap:int = 0):
+
+         """ This is the required method to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         documents: documents from the output dictionary returned by FileConverter
+         remove_punc: whether to remove all punctuation, including ',' and '.'
+         split_by: document splitting strategy, either word or sentence
+         split_length: when synthetically creating the paragraphs from the document,
+             it defines the length of a paragraph.
+         split_respect_sentence_boundary: Used when using the 'word' strategy for
+             splitting of text.
+         split_overlap: Number of words or sentences that overlap when creating
+             the paragraphs. This is done as one sentence or 'some words' make sense
+             when read together with others. Therefore the overlap is used.
+
+         Return
+         ---------
+         output: dictionary, with key as identifier and value could be anything
+             we need to return. In this case the output will contain 4 objects:
+             the paragraph text list as List, Haystack Documents, a Dataframe and
+             one raw text string.
+
+         output_1: As there is only one outgoing edge, we pass the 'output_1' string
+
+         """
+
+         if split_by == 'sentence':
+             split_respect_sentence_boundary = False
+
+         preprocessor = PreProcessor(
+             clean_empty_lines=True,
+             clean_whitespace=True,
+             clean_header_footer=True,
+             split_by=split_by,
+             split_length=split_length,
+             split_respect_sentence_boundary=split_respect_sentence_boundary,
+             split_overlap=split_overlap,
+
+             # will add page number only in case of PDF, not for text/docx files.
+             add_page_number=True
+         )
+
+         for i in documents:
+             # # basic cleaning before passing it to preprocessor.
+             # i = basic(i)
+             docs_processed = preprocessor.process([i])
+             for item in docs_processed:
+                 item.content = basic(item.content, remove_punc=remove_punc)
+
+         df = pd.DataFrame(docs_processed)
+         all_text = " ".join(df.content.to_list())
+         para_list = df.content.to_list()
+         logging.info('document split into {} paragraphs'.format(len(para_list)))
+         output = {'documents': docs_processed,
+                   'dataframe': df,
+                   'text': all_text,
+                   'paraList': para_list
+                   }
+         return output, "output_1"
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing here; however, to use the custom node we need to have
+         this method for the class.
+         """
+         return
+
+ def processingpipeline():
+     """
+     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
+     from utils.preprocessing
+
+     """
+
+     preprocessing_pipeline = Pipeline()
+     file_converter = FileConverter()
+     custom_preprocessor = UdfPreProcessor()
+
+     preprocessing_pipeline.add_node(component=file_converter,
+                                     name="FileConverter", inputs=["File"])
+     preprocessing_pipeline.add_node(component=custom_preprocessor,
+                                     name='UdfPreProcessor', inputs=["FileConverter"])
+
+     return preprocessing_pipeline
+
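
A sketch of running the pipeline directly (not part of the commit; the file path and name are hypothetical):

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(file_paths="/tmp/report.pdf",   # hypothetical path
                          params={"FileConverter": {"file_path": "/tmp/report.pdf",
                                                    "file_name": "report.pdf"},
                                  "UdfPreProcessor": {"remove_punc": False,
                                                      "split_by": "sentence",
                                                      "split_length": 2}})
    paragraphs = result['paraList']   # also available: 'documents', 'dataframe', 'text'
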
utils/sdg_classifier.py ADDED
@@ -0,0 +1,177 @@
+ from haystack.nodes import TransformersDocumentClassifier
+ from haystack.schema import Document
+ from typing import List, Tuple
+ from typing_extensions import Literal
+ import logging
+ import pandas as pd
+ from pandas import DataFrame, Series
+ from utils.checkconfig import getconfig
+ from utils.streamlitcheck import check_streamlit
+ from utils.preprocessing import processingpipeline
+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
+
+ ## Labels dictionary ###
+ _lab_dict = {0: 'no_cat',
+              1: 'SDG 1 - No poverty',
+              2: 'SDG 2 - Zero hunger',
+              3: 'SDG 3 - Good health and well-being',
+              4: 'SDG 4 - Quality education',
+              5: 'SDG 5 - Gender equality',
+              6: 'SDG 6 - Clean water and sanitation',
+              7: 'SDG 7 - Affordable and clean energy',
+              8: 'SDG 8 - Decent work and economic growth',
+              9: 'SDG 9 - Industry, Innovation and Infrastructure',
+              10: 'SDG 10 - Reduced inequality',
+              11: 'SDG 11 - Sustainable cities and communities',
+              12: 'SDG 12 - Responsible consumption and production',
+              13: 'SDG 13 - Climate action',
+              14: 'SDG 14 - Life below water',
+              15: 'SDG 15 - Life on land',
+              16: 'SDG 16 - Peace, justice and strong institutions',
+              17: 'SDG 17 - Partnership for the goals',}
+
+ @st.cache(allow_output_mutation=True)
+ def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
+     """
+     Loads the document classifier using haystack, where the name/path of the model
+     on the HF hub, as a string, is used to fetch the model object. Either the config
+     file or the model name should be passed.
+     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
+     2. https://docs.haystack.deepset.ai/docs/document_classifier
+
+     Params
+     --------
+     config_file: config file path from which to read the model name
+     classifier_name: if the model name is passed, it takes priority; if not
+         found then the config file is used, else an error is raised.
+
+
+     Return: document classifier model
+     """
+     if not classifier_name:
+         if not config_file:
+             logging.warning("Pass either model name or config file")
+             return
+         else:
+             config = getconfig(config_file)
+             classifier_name = config.get('sdg', 'MODEL')
+
+     logging.info("Loading classifier")
+     doc_classifier = TransformersDocumentClassifier(
+         model_name_or_path=classifier_name,
+         task="text-classification")
+
+     return doc_classifier
+
+
+ @st.cache(allow_output_mutation=True)
+ def sdg_classification(haystack_doc:List[Document],
+                        threshold:float = 0.8,
+                        classifier_model:TransformersDocumentClassifier = None
+                        ) -> Tuple[DataFrame, Series]:
+     """
+     Text classification on the list of texts provided. The classifier provides the
+     most appropriate label for each text; these labels indicate which particular
+     Sustainable Development Goal (SDG) the text belongs to.
+
+     Params
+     ---------
+     haystack_doc: List of haystack Documents. The output of the preprocessing
+         pipeline contains the list of paragraphs in different formats; here the
+         list of Haystack Documents is used.
+     threshold: threshold value for the model to keep the results from the classifier
+     classifier_model: you can pass the classifier model directly, which takes
+         priority; if not, the model is looked up in the streamlit session.
+         In case of streamlit avoid passing the model directly.
+
+
+     Returns
+     ----------
+     df: Dataframe with columns ['SDG', 'Relevancy', 'text']
+     x: Series object with the unique SDGs covered in the uploaded document and
+         the number of times each is covered/discussed (count of paragraphs).
+
+     """
+     logging.info("Working on SDG Classification")
+     if not classifier_model:
+         if check_streamlit():
+             classifier_model = st.session_state['sdg_classifier']
+         else:
+             logging.warning("No streamlit environment found, pass the classifier")
+             return
+
+     results = classifier_model.predict(haystack_doc)
+
+     labels_ = [(l.meta['classification']['label'],
+                 l.meta['classification']['score'], l.content,) for l in results]
+
+     df = DataFrame(labels_, columns=["SDG", "Relevancy", "text"])
+
+     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+     df.index += 1
+     df = df[df['Relevancy'] > threshold]
+
+     # creating the dataframe for value counts of SDG, along with the 'title' of SDGs
+     x = df['SDG'].value_counts()
+     x = x.rename('count')
+     x = x.rename_axis('SDG').reset_index()
+     x["SDG"] = pd.to_numeric(x["SDG"])
+     x = x.sort_values(by=['count'], ascending=False)
+     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
+     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG " + str(x))
+
+     df['SDG'] = pd.to_numeric(df['SDG'])
+     df = df.sort_values('SDG')
+
+     return df, x
+
+ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
+             split_by: Literal["sentence", "word"] = 'sentence',
+             split_length:int = 2, split_respect_sentence_boundary:bool = False,
+             split_overlap:int = 0, remove_punc:bool = False) -> List[Document]:
+     """
+     Creates and runs the preprocessing pipeline; the params for the pipeline
+     are fetched from paramconfig.
+
+     Params
+     ------------
+     file_name: filename, in case of streamlit application use
+         st.session_state['filename']
+     file_path: filepath, in case of streamlit application use st.session_state['filepath']
+     split_by: document splitting strategy, either word or sentence
+     split_length: when synthetically creating the paragraphs from the document,
+         it defines the length of a paragraph.
+     split_respect_sentence_boundary: Used when using the 'word' strategy for
+         splitting of text.
+     split_overlap: Number of words or sentences that overlap when creating
+         the paragraphs. This is done as one sentence or 'some words' make sense
+         when read together with others. Therefore the overlap is used.
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+
+     Return
+     --------------
+     List[Document]: When the preprocessing pipeline is run, the output dictionary
+         has four objects. For the Haystack implementation of SDG classification we
+         need to use the List of Haystack Document, which can be fetched by
+         key = 'documents' on the output.
+
+     """
+
+     sdg_processing_pipeline = processingpipeline()
+
+     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
+             params= {"FileConverter": {"file_path": file_path,
+                                        "file_name": file_name},
+                      "UdfPreProcessor": {"remove_punc": remove_punc,
+                                          "split_by": split_by,
+                                          "split_length": split_length,
+                                          "split_overlap": split_overlap,
+                                          "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+     return output_sdg_pre
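
A hedged end-to-end sketch combining the helpers in this file (not part of the commit); the file paths and config file name are assumptions:

    from utils.sdg_classifier import (runSDGPreprocessingPipeline, load_sdgClassifier,
                                      sdg_classification)

    output = runSDGPreprocessingPipeline(file_name="report.pdf",
                                         file_path="/tmp/report.pdf")      # hypothetical
    classifier = load_sdgClassifier(config_file="paramconfig.cfg")         # or classifier_name=...
    df, counts = sdg_classification(output['documents'], threshold=0.8,
                                    classifier_model=classifier)
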
utils/semantic_search.py ADDED
@@ -0,0 +1,582 @@
+ from haystack.nodes import TransformersQueryClassifier, Docs2Answers
+ from haystack.nodes import EmbeddingRetriever, FARMReader
+ from haystack.nodes.base import BaseComponent
+ from haystack.document_stores import InMemoryDocumentStore
+ from markdown import markdown
+ from annotated_text import annotation
+ from haystack.schema import Document
+ from typing import List, Text, Union
+ from typing_extensions import Literal
+ from utils.preprocessing import processingpipeline
+ from utils.streamlitcheck import check_streamlit
+ from haystack.pipelines import Pipeline
+ import pandas as pd
+ import logging
+ try:
+     from termcolor import colored
+ except ImportError:
+     pass
+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
+
+
+ @st.cache(allow_output_mutation=True)
+ def loadQueryClassifier():
+     """
+     Returns the haystack query classifier model.
+     model = shahrukhx01/bert-mini-finetune-question-detection
+
+     """
+     query_classifier = TransformersQueryClassifier(model_name_or_path=
+                         "shahrukhx01/bert-mini-finetune-question-detection")
+     return query_classifier
+
+ class QueryCheck(BaseComponent):
+     """
+     Uses the Query Classifier from Haystack to process the query based on query type.
+     Its ability to recognise statements is not very good, therefore there is a chance
+     that statements also get modified. Ex: "List water related issues" will be
+     identified by the model as keywords, and it will therefore be processed as "what
+     are the 'list all water related issues' related issues and discussions?".
+     This is a shortcoming but is ignored for now, as semantic search is not
+     affected much by it. If you want to pass a list of keywords and do batch
+     processing, use run_batch. Example: if you want to find relevant passages
+     for water, food security and poverty, then querylist = ["water", "food
+     security", "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+
+     1. https://docs.haystack.deepset.ai/docs/query_classifier
+
+     """
+
+     outgoing_edges = 1
+
+     def run(self, query:str):
+         """
+         Mandatory method to use the custom node. Determines the query type;
+         if the query is of type keyword/statement it will be modified to make it
+         more useful for sentence transformers.
+
+         Params
+         --------
+         query: query/statement/keywords in form of a string
+
+         Return
+         ------
+         output: dictionary, with key as identifier and value could be anything
+             we need to return. In this case the output contains key = 'query'.
+
+         output_1: As there is only one outgoing edge, we pass the 'output_1' string
+
+         """
+         query_classifier = loadQueryClassifier()
+         result = query_classifier.run(query=query)
+
+         if result[1] == "output_1":
+             output = {"query": query,
+                       "query_type": 'question/statement'}
+         else:
+             output = {"query": "what are the {} related issues and \
+                 discussions?".format(query),
+                       "query_type": 'statements/keyword'}
+         logging.info(output)
+         return output, "output_1"
+
+     def run_batch(self, queries:List[str]):
+         """
+         Runs multiple queries in one go; however the queries need to be passed
+         as a list of strings. Example: if you want to find relevant passages for
+         water, food security and poverty, then querylist = ["water", "food security",
+         "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
+
+         Params
+         --------
+         queries: queries/statements/keywords in form of strings encapsulated
+             within a List
+
+         Return
+         ------
+         output: dictionary, with key as identifier and value could be anything
+             we need to return. In this case the output contains key = 'queries'.
+
+         output_1: As there is only one outgoing edge, we pass the 'output_1' string
+         """
+         query_classifier = loadQueryClassifier()
+         query_list = []
+         for query in queries:
+             result = query_classifier.run(query=query)
+             if result[1] == "output_1":
+                 query_list.append(query)
+             else:
+                 query_list.append("what are the {} related issues and \
+                     discussions?".format(query))
+         output = {'queries': query_list}
+         logging.info(output)
+         return output, "output_1"
+
+
+ @st.cache(allow_output_mutation=True)
+ def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
+                 split_by: Literal["sentence", "word"] = 'sentence',
+                 split_length:int = 2, split_overlap:int = 0,
+                 split_respect_sentence_boundary:bool = False,
+                 remove_punc:bool = False) -> List[Document]:
+     """
+     Creates and runs the preprocessing pipeline.
+
+     Params
+     ------------
+
+     file_name: filename, in case of streamlit application use
+         st.session_state['filename']
+     file_path: filepath, in case of streamlit application use
+         st.session_state['filepath']
+     split_by: document splitting strategy, either word or sentence
+     split_length: when synthetically creating the paragraphs from the document,
+         it defines the length of a paragraph.
+     split_overlap: Number of words or sentences that overlap when creating the
+         paragraphs. This is done as one sentence or 'some words' make sense
+         when read together with others. Therefore the overlap is used.
+     split_respect_sentence_boundary: Used when using the 'word' strategy for
+         splitting of text.
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Return
+     --------------
+     List[Document]: When the preprocessing pipeline is run, the output dictionary
+         has four objects. For the Haystack implementation of semantic search we
+         need to use the List of Haystack Document, which can be fetched by
+         key = 'documents' on the output.
+
+     """
+
+     semantic_processing_pipeline = processingpipeline()
+
+     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
+             params= {"FileConverter": {"file_path": file_path,
+                                        "file_name": file_name},
+                      "UdfPreProcessor": {"remove_punc": remove_punc,
+                                          "split_by": split_by,
+                                          "split_length": split_length,
+                                          "split_overlap": split_overlap,
+                                          "split_respect_sentence_boundary": split_respect_sentence_boundary}})
+
+     return output_semantic_pre
+
+
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+           allow_output_mutation=True)
+ def loadRetriever(embedding_model:Text = None, embedding_model_format:Text = None,
+                   embedding_layer:int = None, retriever_top_k:int = 10,
+                   max_seq_len:int = 512, document_store:InMemoryDocumentStore = None):
+     """
+     Returns the Retriever model based on the params provided.
+     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+     2. https://www.sbert.net/examples/applications/semantic-search/README.html
+     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+
+
+     Params
+     ---------
+     embedding_model: Name of the model to be used for embedding. Check the links
+         provided in the documentation
+     embedding_model_format: check the github link of Haystack provided in the documentation
+     embedding_layer: check the github link of Haystack provided in the documentation
+     retriever_top_k: Number of top results to be returned by the retriever
+     max_seq_len: every model has a max seq len it can handle, check the
+         model card. Needed to handle the edge cases.
+     document_store: InMemoryDocumentStore; write the haystack Document list to the
+         DocumentStore and pass the same to the function call. Can be done using
+         createDocumentStore from utils.
+
+     Return
+     -------
+     retriever: embedding model
+     """
+     logging.info("loading retriever")
+     if document_store is None:
+         logging.warning("Retriever initialization requires the DocumentStore")
+         return
+
+     retriever = EmbeddingRetriever(
+                 embedding_model=embedding_model, top_k=retriever_top_k,
+                 document_store=document_store,
+                 emb_extraction_layer=embedding_layer, scale_score=True,
+                 model_format=embedding_model_format, use_gpu=True,
+                 max_seq_len=max_seq_len)
+     if check_streamlit():
+         st.session_state['retriever'] = retriever
+     return retriever
+
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+           allow_output_mutation=True)
+ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
+                         embedding_dim:int = 768):
+     """
+     Creates the InMemory Document Store from a haystack list of Documents.
+     It is a mandatory component for the Retriever to work in the Haystack framework.
+
+     Params
+     -------
+     documents: List of haystack Documents. If using the preprocessing pipeline,
+         can be fetched with key = 'documents' on the output of the preprocessing pipeline.
+     similarity: scoring function, can be either 'cosine' or 'dot_product'
+     embedding_dim: The document store has a default embedding size of 768, and the
+         update_embeddings method of the DocStore cannot infer the embedding size of the
+         retriever automatically, therefore set this value as per the model card.
+
+     Return
+     -------
+     document_store: InMemory Document Store object type.
+
+     """
+     document_store = InMemoryDocumentStore(similarity = similarity,
+                                            embedding_dim = embedding_dim)
+     document_store.write_documents(documents)
+
+     return document_store
+
+
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
+           allow_output_mutation=True)
+ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
+                 embedding_model_format:Text = None, embedding_layer:int = None,
+                 embedding_dim:int = 768, retriever_top_k:int = 10,
+                 reader_model:str = None, reader_top_k:int = 10,
+                 max_seq_len:int = 512, useQueryCheck = True,
+                 top_k_per_candidate:int = 1):
+     """
+     Creates the semantic search pipeline and the Document Store object from the
+     list of haystack documents. The top_k for the Reader and Retriever are kept the
+     same, so that all the results returned by the Retriever are used, however the
+     context is extracted by the Reader for each retrieved result. The QueryCheck is
+     added as a node to process the query. This pipeline is suited for keyword search,
+     and to some extent for extractive QA. The purpose of the Reader is strictly to
+     highlight the context for each retrieved result and not for QA, however as stated
+     it can work for QA too in a limited sense.
+     There are 4 variants of pipeline it can return:
+     1. QueryCheck > Retriever > Reader
+     2. Retriever > Reader
+     3. QueryCheck > Retriever > Docs2Answers : If the reader is None,
+        then Docs2Answers is used to keep the output of the pipeline structurally the same.
+     4. Retriever > Docs2Answers
+
+     Links
+
+     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
+     2. https://www.sbert.net/examples/applications/semantic-search/README.html
+     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
+     4. https://docs.haystack.deepset.ai/docs/reader
+
+
+     Params
+     ----------
+     documents: list of Haystack Documents, returned by the preprocessing pipeline.
+     embedding_model: Name of the model to be used for embedding. Check the links
+         provided in the documentation
+     embedding_model_format: check the github link of Haystack provided in the
+         documentation
+     embedding_layer: check the github link of Haystack provided in the documentation
+     embedding_dim: The document store has a default embedding size of 768, and the
+         update_embeddings method of the DocStore cannot infer the embedding size of the
+         retriever automatically, therefore set this value as per the model card.
+     retriever_top_k: Number of top results to be returned by the retriever
+     reader_model: Name of the model to be used for the Reader node in the haystack
+         Pipeline. Check the links provided in the documentation
+     reader_top_k: The Reader will use the retrieved results to further find better matches.
+         As the purpose here is to use the reader to extract context, the value is the
+         same as retriever_top_k.
+     max_seq_len: every model has a max seq len it can handle, check the model card.
+         Needed to handle the edge cases
+     useQueryCheck: Whether to use the QueryCheck, which modifies the query, or not.
+     top_k_per_candidate: How many answers to extract for each candidate doc
+         that is coming from the retriever
+
+     Return
+     ---------
+     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
+         nodes [QueryCheck, Retriever, Reader/Docs2Answers]. If the reader is None,
+         then Docs2Answers is used to keep the output of the pipeline structurally
+         the same.
+
+     document_store: As the retriever can work only with a Haystack Document Store, the
+         list of documents returned by the preprocessing pipeline are fed in to
+         get an InMemoryDocumentStore object type, with the retriever updating the
+         embeddings of each paragraph in the document store.
+
+     """
+     document_store = createDocumentStore(documents=documents,
+                                          embedding_dim=embedding_dim)
+     retriever = loadRetriever(embedding_model=embedding_model,
+                               embedding_model_format=embedding_model_format,
+                               embedding_layer=embedding_layer,
+                               retriever_top_k=retriever_top_k,
+                               document_store=document_store,
+                               max_seq_len=max_seq_len)
+     document_store.update_embeddings(retriever)
+     semantic_search_pipeline = Pipeline()
+     if useQueryCheck and reader_model:
+         querycheck = QueryCheck()
+         reader = FARMReader(model_name_or_path=reader_model,
+                             top_k=reader_top_k, use_gpu=True,
+                             top_k_per_candidate=top_k_per_candidate)
+         semantic_search_pipeline.add_node(component=querycheck,
+                             name="QueryCheck", inputs=["Query"])
+         semantic_search_pipeline.add_node(component=retriever,
+                             name="EmbeddingRetriever", inputs=["QueryCheck.output_1"])
+         semantic_search_pipeline.add_node(component=reader, name="FARMReader",
+                             inputs=["EmbeddingRetriever"])
+
+     elif reader_model:
+         reader = FARMReader(model_name_or_path=reader_model,
+                             top_k=reader_top_k, use_gpu=True,
+                             top_k_per_candidate=top_k_per_candidate)
+         semantic_search_pipeline.add_node(component=retriever,
+                             name="EmbeddingRetriever", inputs=["Query"])
+         semantic_search_pipeline.add_node(component=reader,
+                             name="FARMReader", inputs=["EmbeddingRetriever"])
+     elif useQueryCheck and not reader_model:
+         querycheck = QueryCheck()
+         docs2answers = Docs2Answers()
+         semantic_search_pipeline.add_node(component=querycheck,
+                             name="QueryCheck", inputs=["Query"])
+         semantic_search_pipeline.add_node(component=retriever,
+                             name="EmbeddingRetriever", inputs=["QueryCheck.output_1"])
+         semantic_search_pipeline.add_node(component=docs2answers,
+                             name="Docs2Answers", inputs=["EmbeddingRetriever"])
+     elif not useQueryCheck and not reader_model:
+         docs2answers = Docs2Answers()
+         semantic_search_pipeline.add_node(component=retriever,
+                             name="EmbeddingRetriever", inputs=["Query"])
+         semantic_search_pipeline.add_node(component=docs2answers,
+                             name="Docs2Answers", inputs=["EmbeddingRetriever"])
+
+     logging.info(semantic_search_pipeline.components)
+     return semantic_search_pipeline, document_store
+
+ def runSemanticPipeline(pipeline:Pipeline, queries:Union[list, str]) -> dict:
+     """
+     Will use the haystack run or run_batch, depending on whether a single query is
+     passed as a string or multiple queries as List[str].
+
+     Params
+     -------
+     pipeline: haystack pipeline, this is the same as returned by semanticSearchPipeline
+         from utils.semantic_search
+
+     queries: Either a single query or a list of queries.
+
+     Return
+     -------
+     results: Dict containing answers and documents as keys and their respective
+         values
+
+     """
+
+     if type(queries) == list:
+         results = pipeline.run_batch(queries=queries)
+     elif type(queries) == str:
+         results = pipeline.run(query=queries)
+     else:
+         logging.info("Please check the input type for the queries")
+         return
+
+     return results
+
+ def process_query_output(results:dict) -> pd.DataFrame:
+     """
+     Returns the dataframe with the necessary information, including
+     ['query','answer','answer_offset','context_offset','context','content',
+     'reader_score','retriever_score','id']. This is designed for output given
+     by the semantic search pipeline with a single query and the reader as final node.
+     The output of a pipeline having Docs2Answers as final node, or multiple queries,
+     needs to be handled separately. In those other cases, use process_semantic_output
+     from utils.semantic_search, which uses this function internally to make one
+     combined dataframe.
+
+     Params
+     ---------
+     results: this dictionary should have key/values with
+         keys = [query, answers, documents], however answers is optional.
+         In case of [Docs2Answers as final node], process_semantic_output
+         doesn't pass answers, thereby setting all values contained in
+         answers to 'None'
+
+     Return
+     --------
+     df: dataframe with all the columns mentioned in the function description.
+
+     """
+     query_text = results['query']
+     if 'answers' in results.keys():
+         answer_dict = {}
+
+         for answer in results['answers']:
+             answer_dict[answer.document_id] = answer.to_dict()
+     else:
+         answer_dict = {}
+     docs = results['documents']
+     df = pd.DataFrame(columns=['query', 'answer', 'answer_offset', 'context_offset',
+                                'context', 'content', 'reader_score', 'retriever_score',
+                                'id'])
+     for doc in docs:
+         row_list = {}
+         row_list['query'] = query_text
+         row_list['retriever_score'] = doc.score
+         row_list['id'] = doc.id
+         row_list['content'] = doc.content
+         if doc.id in answer_dict.keys():
+             row_list['answer'] = answer_dict[doc.id]['answer']
+             row_list['context'] = answer_dict[doc.id]['context']
+             row_list['reader_score'] = answer_dict[doc.id]['score']
+             answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
+             row_list['answer_offset'] = [answer_offset['start'], answer_offset['end']]
+             start_idx = doc.content.find(row_list['context'])
+             end_idx = start_idx + len(row_list['context'])
+             row_list['context_offset'] = [start_idx, end_idx]
+         else:
+             row_list['answer'] = None
+             row_list['context'] = None
+             row_list['reader_score'] = None
+             row_list['answer_offset'] = None
+             row_list['context_offset'] = None
+         df_dictionary = pd.DataFrame([row_list])
+         df = pd.concat([df, df_dictionary], ignore_index=True)
+
+     return df
+
+ def process_semantic_output(results):
+     """
+     Returns the dataframe with the necessary information, including
+     ['query','answer','answer_offset','context_offset','context','content',
+     'reader_score','retriever_score','id']. Distinguishes between a single query and
+     multiple queries by reading the pipeline output dictionary keys.
+     Uses process_query_output to get the dataframe for each query and creates
+     one concatenated dataframe. In case of Docs2Answers as final node, drops
+     the answers part. See the documentation of process_query_output.
+
+     Params
+     ---------
+     results: raw output of runSemanticPipeline.
+
+     Return
+     --------
+     df: dataframe with all the columns mentioned in the function description.
+
+     """
+     output = {}
+     if 'query' in results.keys():
+         output['query'] = results['query']
+         output['documents'] = results['documents']
+         if results['node_id'] == 'Docs2Answers':
+             pass
+         else:
+             output['answers'] = results['answers']
+         df = process_query_output(output)
+         return df
+     if 'queries' in results.keys():
+         df = pd.DataFrame(columns=['query', 'answer', 'answer_offset',
+                                    'context_offset', 'context', 'content',
+                                    'reader_score', 'retriever_score', 'id'])
+         for query, answers, documents in zip(results['queries'],
+                                              results['answers'], results['documents']):
+             output = {}
+             output['query'] = query
+             output['documents'] = documents
+             if results['node_id'] == 'Docs2Answers':
+                 pass
+             else:
+                 output['answers'] = answers
+
+             temp = process_query_output(output)
+             df = pd.concat([df, temp], ignore_index=True)
+
+
+     return df
+
+ def semanticsearchAnnotator(matches:List[List[int]], document:Text):
+     """
+     Annotates the text in the document defined by a list of [start index, end index].
+     Example: "How are you today"; if the document type is text, matches = [[0,3]]
+     will give answer = "How", however in case we used the spaCy matcher then
+     matches = [[0,3]] will give answer = "How are you". If spaCy is used
+     to find "How" then matches = [[0,1]] for the string defined above.
+
+     """
+     start = 0
+     annotated_text = ""
+     for match in matches:
+         start_idx = match[0]
+         end_idx = match[1]
+         if check_streamlit():
+             annotated_text = (annotated_text + document[start:start_idx]
+                               + str(annotation(body=document[start_idx:end_idx],
+                                     label="Context", background="#964448", color='#ffffff')))
+         else:
+             annotated_text = (annotated_text + document[start:start_idx]
+                               + colored(document[start_idx:end_idx],
+                                         "green", attrs=['bold']))
+         start = end_idx
+
+     # append the remainder of the document after the last match
+     annotated_text = annotated_text + document[start:]
+
+     if check_streamlit():
+
+         st.write(
+             markdown(annotated_text),
+             unsafe_allow_html=True,
+         )
+     else:
+         print(annotated_text)
+
+
+ def semantic_keywordsearch(query:Text, documents:List[Document],
+                            embedding_model:Text,
+                            embedding_model_format:Text,
+                            embedding_layer:int, reader_model:str,
+                            retriever_top_k:int = 10, reader_top_k:int = 10,
+                            return_results:bool = False, embedding_dim:int = 768,
+                            max_seq_len:int = 512, top_k_per_candidate:int = 1,
+                            sort_by:Literal["retriever", "reader"] = 'retriever'):
+     """
+     Performs the semantic search on the list of Haystack documents which is
+     returned by the preprocessing pipeline.
+
+     Params
+     -------
+     query: Keywords that need to be searched in the documents.
+     documents: List of Haystack documents returned by the preprocessing pipeline.
+
+     """
+     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents=documents,
+                         embedding_model=embedding_model,
+                         embedding_layer=embedding_layer,
+                         embedding_model_format=embedding_model_format,
+                         reader_model=reader_model, retriever_top_k=retriever_top_k,
+                         reader_top_k=reader_top_k, embedding_dim=embedding_dim,
+                         max_seq_len=max_seq_len,
+                         top_k_per_candidate=top_k_per_candidate)
+
+     raw_output = runSemanticPipeline(semanticsearch_pipeline, query)
+     results_df = process_semantic_output(raw_output)
+     if sort_by == 'retriever':
+         results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
+     else:
+         results_df = results_df.sort_values(by=['reader_score'], ascending=False)
+
+     if return_results:
+         return results_df
+     else:
+         if check_streamlit():
+             st.markdown("##### Top few semantic search results #####")
+         else:
+             print("Top few semantic search results")
+         for i in range(len(results_df)):
+             if check_streamlit():
+                 st.write("Result {}".format(i + 1))
+             else:
+                 print("Result {}".format(i + 1))
+             semanticsearchAnnotator([results_df.loc[i]['context_offset']],
+                                     results_df.loc[i]['content'])
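
A hedged usage sketch (not part of the commit). The embedding and reader model names below are placeholders, not the models this app actually configures, and the file paths are hypothetical:

    from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch

    output = runSemanticPreprocessingPipeline(file_path="/tmp/report.pdf",
                                              file_name="report.pdf")      # hypothetical paths
    semantic_keywordsearch(query="climate adaptation measures",
                           documents=output['documents'],
                           embedding_model="sentence-transformers/all-MiniLM-L6-v2",   # placeholder
                           embedding_model_format="sentence_transformers",
                           embedding_layer=None,
                           reader_model="deepset/tinyroberta-squad2",                  # placeholder
                           embedding_dim=384, max_seq_len=256)
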
utils/streamlitcheck.py ADDED
@@ -0,0 +1,42 @@
+ import logging
+ try:
+     import streamlit as st
+ except ImportError:
+     logging.info("Streamlit not installed")
+
+
+ def check_streamlit():
+     """
+     Function to check whether python code is run within streamlit
+
+     Returns
+     -------
+     use_streamlit : boolean
+         True if code is run within streamlit, else False
+     """
+     try:
+         from streamlit.scriptrunner.script_run_context import get_script_run_ctx
+         if not get_script_run_ctx():
+             use_streamlit = False
+         else:
+             use_streamlit = True
+     except ModuleNotFoundError:
+         use_streamlit = False
+     return use_streamlit
+
+ def disable_other_checkboxes(*other_checkboxes_keys):
+     """Callback that unchecks every other checkbox so only one stays selected."""
+     for checkbox_key in other_checkboxes_keys:
+         st.session_state[checkbox_key] = False
+
+ def checkbox_without_preselect(keylist):
+     """Renders one checkbox per key in keylist with no pre-selection and
+     returns the label of the checked box, or None."""
+     dict_ = {}
+     for i, key_val in enumerate(keylist):
+         dict_[i] = st.checkbox(key_val, key=key_val,
+                                on_change=disable_other_checkboxes,
+                                args=tuple(list(filter(lambda x: x != key_val, keylist))),)
+
+     for key, val in dict_.items():
+         if val == True:
+             return keylist[int(key)]
+
+     return None
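
A small sketch of the intended use (not part of the commit; the checkbox labels are invented):

    from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

    if check_streamlit():
        # mutually exclusive checkboxes; returns the selected label, if any
        selection = checkbox_without_preselect(["SDG Analysis", "Keyword Search"])
    else:
        print("Running outside Streamlit")
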
utils/uploadAndExample.py ADDED
@@ -0,0 +1,33 @@
+ import streamlit as st
+ import tempfile
+ import json
+
+ def add_upload(choice):
+     """
+     Provides the user with the choice to either 'Upload Document' or 'Try Example'.
+     Based on the user choice, runs the streamlit processes and saves the path and
+     name of the 'file' to the streamlit session_state, which can then be fetched later.
+
+     """
+
+     if choice == 'Upload Document':
+         uploaded_file = st.sidebar.file_uploader('Upload the File',
+                                                  type=['pdf', 'docx', 'txt'])
+         if uploaded_file is not None:
+             with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
+                 bytes_data = uploaded_file.getvalue()
+                 temp.write(bytes_data)
+                 st.session_state['filename'] = uploaded_file.name
+                 st.session_state['filepath'] = temp.name
+
+
+     else:
+         # listing the options
+         with open('docStore/sample/files.json', 'r') as json_file:
+             files = json.load(json_file)
+
+         option = st.sidebar.selectbox('Select the example document',
+                                       list(files.keys()))
+         file_name = file_path = files[option]
+         st.session_state['filename'] = file_name
+         st.session_state['filepath'] = file_path