Upload 9 files
Browse files- utils/checkconfig.py +15 -0
- utils/keyword_extraction.py +140 -0
- utils/lexical_search.py +251 -0
- utils/ndc_explorer.py +90 -0
- utils/preprocessing.py +260 -0
- utils/sdg_classifier.py +177 -0
- utils/semantic_search.py +582 -0
- utils/streamlitcheck.py +42 -0
- utils/uploadAndExample.py +33 -0
utils/checkconfig.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import configparser
|
2 |
+
import logging
|
3 |
+
|
4 |
+
def getconfig(configfile_path:str):
    """
    Read a configuration file and return the parsed ConfigParser object.

    Params
    ----------
    configfile_path: file path of .cfg file

    Return
    ----------
    config: configparser.ConfigParser filled from the file, or None when the
    file cannot be opened or parsed (a warning is logged in that case).
    """

    config = configparser.ConfigParser()

    try:
        # the context manager closes the file handle once parsing is done
        with open(configfile_path) as configfile:
            config.read_file(configfile)
    except OSError:
        logging.warning("config file not found")
        return None
    except configparser.Error as err:
        logging.warning("config file could not be parsed: %s", err)
        return None

    return config
|
utils/keyword_extraction.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
3 |
+
# import nltk
|
4 |
+
# nltk.download('stopwords')
|
5 |
+
# from nltk.corpus import stopwords
|
6 |
+
import pickle
|
7 |
+
from typing import List, Text
|
8 |
+
import logging
|
9 |
+
from summa import keywords
|
10 |
+
|
11 |
+
try:
|
12 |
+
import streamlit as st
|
13 |
+
except ImportError:
|
14 |
+
logging.info("Streamlit not installed")
|
15 |
+
|
16 |
+
|
17 |
+
def sort_coo(coo_matrix):
    """
    Pairs every column index of a coordinate-format sparse matrix with its
    stored value and orders the pairs from highest to lowest value; equal
    values are ordered by descending column index.
    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
    """
    def _score_then_column(pair):
        column, score = pair
        return (score, column)

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=_score_then_column, reverse=True)
    return pairs
|
24 |
+
|
25 |
+
def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
    """map the best scoring feature indices back to their words

    Params
    ---------
    feature_names: list of words from vectorizer, indexed by feature position
    sorted_items: (index, score) tuples as returned by sort_coo function
    defined in keyword_extraction.py
    top_n: number of leading items of sorted_items to keep

    Return
    ----------
    results: dictionary mapping each kept word to its score rounded to three
    decimals

    """
    # only the first top_n entries of the already sorted items are considered
    return {feature_names[position]: round(weight, 3)
            for position, weight in sorted_items[:top_n]}
|
58 |
+
|
59 |
+
|
60 |
+
def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
    """
    TFIDF based keywords extraction

    Params
    ---------
    textdata: text data which needs keyword extraction. A single string is
    wrapped into a one-element list before vectorizing; any other iterable
    of texts is passed through unchanged.
    vectorizer: trained count vectorizer model
    tfidfmodel: TFIDF Transformer model
    top_n: Top N keywords to be extracted

    Return
    ----------
    keywords: top extracted keywords, best scoring first

    """
    # NOTE(review): presumably a scikit-learn CountVectorizer, whose transform
    # expects an iterable of documents and rejects a bare string.
    documents = [textdata] if isinstance(textdata, str) else textdata

    features = vectorizer.get_feature_names_out()
    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(documents))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    results = extract_topn_from_vector(features, sorted_items, top_n)
    # the dictionary keeps insertion order, i.e. the ranking of sorted_items
    return list(results)
|
82 |
+
|
83 |
+
def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
    """
    TFIDF based keywords extraction using the pickled models stored for the
    given sdg under docStore/sdg<sdg>/.

    Params
    ---------
    sdg: which sdg tfidf model to be used
    sdgdata: text data which needs keyword extraction
    top_n: Top N keywords to be extracted


    Return
    ----------
    keywords: top extracted keywords, best scoring first

    """
    model_path = "docStore/sdg{}/".format(sdg)

    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # docStore files must come from a trusted source.
    # The context managers make sure both file handles are closed.
    with open(model_path+'vectorizer.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open(model_path+'tfidfmodel.pkl', 'rb') as tfidf_file:
        tfidfmodel = pickle.load(tfidf_file)

    features = vectorizer.get_feature_names_out()
    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(sdgdata))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    results = extract_topn_from_vector(features, sorted_items, top_n)
    # the dictionary keeps insertion order, i.e. the ranking of sorted_items
    return list(results)
|
108 |
+
|
109 |
+
@st.cache(allow_output_mutation=True)
def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
    """
    wrapper function to perform textrank, uses either ratio or wordcount to
    extract top keywords limited by words or ratio.
    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py

    Params
    --------
    textdata: text data to perform the textrank.
    ratio: float to limit the number of keywords as proportion of total token
    in textdata
    words: number of keywords to be extracted. Takes priority over ratio if
    non zero. However, if the extraction with a fixed word count fails, the
    ratio is used instead.

    Return
    --------
    results: extracted keywords
    """
    if words == 0:
        # report the ratio actually in use, not just the default value
        logging.info("Textrank using ratio value = %s, as no words limit given",
                     ratio)
        return keywords.keywords(textdata, ratio=ratio).split("\n")

    try:
        results = keywords.keywords(textdata, words=words).split("\n")
    except Exception:
        # best-effort fallback; KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were with a bare except
        logging.warning("Textrank with words limit failed, using ratio instead")
        results = keywords.keywords(textdata, ratio=ratio).split("\n")

    return results
|
139 |
+
|
140 |
+
|
utils/lexical_search.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack.nodes import TfidfRetriever
|
2 |
+
from haystack.document_stores import InMemoryDocumentStore
|
3 |
+
import spacy
|
4 |
+
import re
|
5 |
+
from spacy.matcher import Matcher
|
6 |
+
from markdown import markdown
|
7 |
+
from annotated_text import annotation
|
8 |
+
from haystack.schema import Document
|
9 |
+
from typing import List, Text, Tuple
|
10 |
+
from typing_extensions import Literal
|
11 |
+
from utils.preprocessing import processingpipeline
|
12 |
+
from utils.streamlitcheck import check_streamlit
|
13 |
+
import logging
|
14 |
+
try:
|
15 |
+
from termcolor import colored
|
16 |
+
except:
|
17 |
+
pass
|
18 |
+
|
19 |
+
try:
|
20 |
+
import streamlit as st
|
21 |
+
except ImportError:
|
22 |
+
logging.info("Streamlit not installed")
|
23 |
+
|
24 |
+
|
25 |
+
def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
                        split_by: Literal["sentence", "word"] = 'word',
                        split_length:int = 80, split_overlap:int = 0,
                        remove_punc:bool = False,)->List[Document]:
    """
    Builds the preprocessing pipeline via processingpipeline() and runs it on
    one file. The defaults suit lexical search: the text is split by word and
    no overlap is used, since overlap does not affect lexical matching.

    Params
    ------------

    file_name: filename, in case of streamlit application use
    st.session_state['filename']
    file_path: filepath, in case of streamlit application use
    st.session_state['filepath']
    split_by: document splitting strategy either as word or sentence
    split_length: when synthetically creating the paragraphs from document,
    it defines the length of paragraph.
    split_overlap: Number of words or sentences that overlap when creating
    the paragraphs. This is done as one sentence or 'some words' make sense
    when read in together with others. Therefore the overlap is used.
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Return
    --------------
    The value returned by running the pipeline is handed back unchanged.
    NOTE(review): despite the List[Document] annotation this looks like the
    pipeline output dictionary, from which the list of Haystack Documents
    needed by the TfidfRetriever is fetched with key = 'documents' - confirm
    against callers.

    """

    pipeline = processingpipeline()

    # parameters are routed to the pipeline nodes by node name
    converter_params = {"file_path": file_path,
                        "file_name": file_name}
    preprocessor_params = {"remove_punc": remove_punc,
                           "split_by": split_by,
                           "split_length": split_length,
                           "split_overlap": split_overlap}

    return pipeline.run(file_paths=file_path,
                        params={"FileConverter": converter_params,
                                "UdfPreProcessor": preprocessor_params})
|
72 |
+
|
73 |
+
|
74 |
+
def tokenize_lexical_query(query:str)-> List[str]:
    """
    Turns the query into a list of lower-cased tokens, leaving out every token
    spaCy flags as a stop word or as punctuation. The retrieval itself is done
    by the Haystack TfidfRetriever; this token list is what gets highlighted
    in the retrieved paragraphs.

    Params
    --------
    query: string which represents either list of keywords user is looking for
    or a query in form of Question.

    Return
    -----------
    token_list: list of important keywords in the query.

    """
    nlp = spacy.load("en_core_web_sm")

    token_list = []
    for token in nlp(query):
        if token.is_stop or token.is_punct:
            continue
        token_list.append(token.text.lower())
    return token_list
|
95 |
+
|
96 |
+
def runSpacyMatcher(token_list:List[str], document:Text
                    )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
    """
    Locates the keywords in the document with the spaCy Matcher class, using
    one single-token pattern per keyword that compares on the token's
    lower-cased form.

    Params
    -------
    token_list: this is token list which tokenize_lexical_query function returns
    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the spacydoc(at word level not
    character) for the keywords in token list.

    spacydoc: the keyword index in the spacydoc are at word level and not character,
    therefore to allow the annotator to work seamlessly we return the spacydoc.

    """
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)

    matcher = Matcher(nlp.vocab)
    patterns = []
    for token in token_list:
        patterns.append([{"LOWER": token}])
    # all patterns are registered under a single key built from the tokens
    matcher.add(",".join(token_list), patterns)

    # keep only the word-level span boundaries, the match id is not needed
    matches = [[start, end] for _, start, end in matcher(spacydoc)]

    return matches, spacydoc
|
130 |
+
|
131 |
+
def runRegexMatcher(token_list:List[str], document:Text):
    """
    Using the regex in backend finds the keywords in the document. Every
    token is searched as literal text (regex metacharacters are escaped) and
    the search is case-sensitive.

    Params
    -------
    token_list: this is token list which tokenize_lexical_query function returns

    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the document for the keywords
    in token list at character level, grouped by token in token_list order.

    document: the keyword index returned by regex are at character level,
    therefore to allow the annotator to work seamlessly we return the text back.

    """
    matches = []
    for token in token_list:
        if not token:
            # an empty pattern would match at every position of the document
            continue
        # escape so that tokens such as "c++" or "a.c" are matched literally
        pattern = re.escape(token)
        matches.extend([found.start(), found.end()]
                       for found in re.finditer(pattern, document))

    return matches, document
|
157 |
+
|
158 |
+
def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
    """
    This is spacy Annotator and needs spacy.doc
    Annotates the text in the document defined by list of [start index, end index]
    Example: "How are you today", if document type is text, matches = [[0,3]]
    will give answer = "How", however in case we used the spacy matcher then the
    matches = [[0,3]] will give answer = "How are you". However if spacy is used
    to find "How" then the matches = [[0,1]] for the string defined above.

    Params
    -----------
    matches: As mentioned its list of list. Example [[0,1],[10,13]]. An empty
    list is accepted and yields the document text without highlights.
    document: document which needs to be indexed.


    Return
    --------
    Nothing is returned; the output is sent either to the app front end using
    streamlit or written directly to the output screen.

    """
    # NOTE(review): the pieces are joined via Span.text, which presumably
    # omits trailing whitespace - confirm whether text_with_ws is intended.
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]

        if check_streamlit():
            highlighted = str(annotation(body=document[start_idx:end_idx].text,
                              label="ANSWER", background="#964448",
                              color='#ffffff'))
        else:
            highlighted = colored(document[start_idx:end_idx].text,
                                  "green", attrs = ['bold'])

        annotated_text = (annotated_text + document[start:start_idx].text
                          + highlighted)
        start = end_idx

    # 'start' equals the end of the last match, or 0 when there were no
    # matches, so the remaining text is appended without relying on end_idx
    annotated_text = annotated_text + document[start:].text

    if check_streamlit():
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
|
208 |
+
|
209 |
+
def lexical_search(query:Text, documents:List[Document],top_k:int):
    """
    Runs a TFIDF retrieval over the given Haystack documents and shows every
    retrieved paragraph that contains at least one query keyword, with the
    keywords highlighted. Output goes to streamlit when check_streamlit()
    says so, otherwise it is printed.

    Params
    -------
    query: Keywords that need to be searched in documents.
    documents: List of Haystack documents returned by preprocessing pipeline.
    top_k: Number of Top results to be fetched.

    """

    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)

    # Haystack Retriever works with document stores only.
    retriever = TfidfRetriever(document_store)
    results = retriever.retrieve(query=query, top_k = top_k)
    query_tokens = tokenize_lexical_query(query)

    nothing_shown = True
    for position, hit in enumerate(results, start=1):
        matches, doc = runSpacyMatcher(query_tokens, hit.content)
        if not matches:
            continue

        # the heading is emitted once, right before the first hit
        if nothing_shown:
            nothing_shown = False
            if check_streamlit():
                st.markdown("##### Top few lexical search (TFIDF) hits #####")
            else:
                print("Top few lexical search (TFIDF) hits")

        if check_streamlit():
            st.write("Result {}".format(position))
        else:
            print("Results {}".format(position))
        spacyAnnotator(matches, doc)

    if nothing_shown:
        if check_streamlit():
            st.info("🤔 No relevant result found. Please try another keyword.")
        else:
            print("No relevant result found. Please try another keyword.")
|
utils/ndc_explorer.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import urllib.request
|
3 |
+
import json
|
4 |
+
|
5 |
+
# URL of the NDC open-data JSON dataset that get_document downloads.
link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
|
6 |
+
def get_document(country_code: str):
    """
    read the country NDC data from
    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
    using the country code.

    Params
    -------
    country_code: key of the country inside the 'NDCs' section of the dataset

    Return
    -------
    country: dictionary with one entry per category of the dataset; each entry
    maps the subcategory keys of that category to the country's
    'classification' value for that subcategory. None is returned when the
    dataset has no 'NDCs' section or the country code is not part of it.
    """
    with urllib.request.urlopen(link) as urlfile:
        data = json.loads(urlfile.read())

    categories = data['categories']
    subcategories = data['subcategories']

    ndcs = data.get('NDCs')
    if ndcs is None or country_code not in ndcs:
        return None

    # these fields hold plain values, every other entry is reduced to its
    # 'classification'
    plain_fields = ('country_name', 'region_id', 'region_name')
    get_dict = {key: value if key in plain_fields else value['classification']
                for key, value in ndcs[country_code].items()}

    country = {category: {} for category in categories}
    for key, value in subcategories.items():
        country[value['category']][key] = get_dict[key]

    return country
|
42 |
+
|
43 |
+
|
44 |
+
def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
    """
    based on the countrycode, reads the country data from
    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
    using get_document from utils.ndc_explorer.py
    then based on threshold value filters the Climate Change Adaptation
    targets assigned by NDC explorer team to that country. The sentences
    created by Data services team of GIZ for each target level are then meant
    to be used to find the relevant passages in the document by semantic
    search.

    Params
    -------
    cca_sent: dictionary with key as 'target labels' and manufactured sentences
    reflecting the target level. Please see the docStore/ndcs/cca.txt

    threshold: NDC target have many categories ranging from [0-5], with 0
    reflecting most relaxed attitude and 5 being most aggressive towards Climate
    change. We select the threshold value beyond which we need to focus on.

    countryCode: standard country code to allow us to fetch the country specific
    data.

    Return
    -------
    temp: dictionary with, for every target whose country level is above the
    threshold, the sentence belonging to that level. Empty when no data is
    available for the country code.

    """
    # imported locally so that this block does not depend on a file-level import
    import logging

    doc = get_document(countryCode)
    if doc is None:
        # get_document returns None for unknown country codes
        logging.warning("No NDC data found for country code %s", countryCode)
        return {}

    temp = {}
    for key, value in cca_sent.items():
        id_ = doc['climate change adaptation'][key]['id']
        if id_ > threshold:
            temp[key] = value['id'][id_]
    return temp
|
74 |
+
|
75 |
+
|
76 |
+
def countrySpecificCCM(ccm_sent:dict, threshold:int, countryCode:str):
    """
    see the documentation of countrySpecificCCA. This works the same way, but
    it reads the country data of the 'climate change mitigation' category.

    Params
    -------
    ccm_sent: dictionary with key as 'target labels' and the sentences
    reflecting each target level.
    threshold: only targets whose country level is above this value are kept.
    countryCode: country code used to fetch the country specific data.

    Return
    -------
    temp: dictionary with, for every target whose country level is above the
    threshold, the sentence belonging to that level. Empty when no data is
    available for the country code.

    """
    # imported locally so that this block does not depend on a file-level import
    import logging

    doc = get_document(countryCode)
    if doc is None:
        # get_document returns None for unknown country codes
        logging.warning("No NDC data found for country code %s", countryCode)
        return {}

    temp = {}
    for key, value in ccm_sent.items():
        id_ = doc['climate change mitigation'][key]['id']
        if id_ > threshold:
            temp[key] = value['id'][id_]

    return temp
|
utils/preprocessing.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack.nodes.base import BaseComponent
|
2 |
+
from haystack.schema import Document
|
3 |
+
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
|
4 |
+
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
|
5 |
+
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
|
6 |
+
from typing_extensions import Literal
|
7 |
+
import pandas as pd
|
8 |
+
import logging
|
9 |
+
import re
|
10 |
+
import string
|
11 |
+
from haystack.pipelines import Pipeline
|
12 |
+
|
13 |
+
def useOCR(file_path: str)-> Text:
    """
    Runs the Haystack PDFToTextOCRConverter (Farm-haystack[OCR]) on an image
    pdf to obtain its text.

    Params
    ----------
    file_path: file_path of uploaded file, returned by add_upload function in
    uploadAndExample.py

    Returns the content of the first converted document as string.
    """

    ocr_converter = PDFToTextOCRConverter(valid_languages=["eng"],
                                          remove_numeric_tables=True)
    converted = ocr_converter.convert(file_path=file_path, meta=None)
    first_document = converted[0]
    return first_document.content
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
class FileConverter(BaseComponent):
    """
    Wrapper class to convert uploaded document into text by calling appropriate
    Converter class, will use internally haystack PDFToTextOCR in case of image
    pdf. Cannot use the FileClassifier from haystack as it doesn't have any
    label/output class for image.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api


    """

    outgoing_edges = 1

    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of file; its extension (.pdf, .txt or .docx, compared
        case-insensitively) selects the converter
        file_path: file_path of uploaded file, returned by add_upload function in
        uploadAndExample.py

        See the links provided in Class docstring/description to see other params

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case its the List of Haystack Document

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        None is returned (after logging an error) when the file type is not
        supported or the converter cannot be created.
        """
        extension = file_name.lower()
        try:
            if extension.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            elif extension.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            elif extension.endswith('.docx'):
                converter = DocxToTextConverter()
            else:
                # without this branch 'converter' would be unbound below
                logging.error("unsupported file type: %s", file_name)
                return
        except Exception as e:
            logging.error(e)
            return

        document = converter.convert(
                      file_path=file_path, meta=None,
                      encoding=encoding, id_hash_keys=id_hash_keys
                      )[0]

        # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
        # remove these characters so that an empty content can be detected
        text = document.content.replace('\x0c', '')

        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        # check if text is empty and apply pdfOCR converter.
        for doc in documents:
            if doc.content == "":
                logging.info("Using OCR")
                doc.content = useOCR(file_path)

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    # NOTE(review): defined without 'self', so it cannot be called on an
    # instance; kept as is because nothing in this file calls it.
    def run_batch():
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """

        return
|
121 |
+
|
122 |
+
|
123 |
+
def basic(s:str, remove_punc:bool = False):

    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Returns: processed string, cleaned in this order: lines starting with a
    URL and any remaining 'http...' token are replaced by a space, new lines
    become spaces, punctuation is removed if requested, single quotes become
    spaces, '..' is removed and surrounding whitespace is stripped.
    """

    # Remove URLs: first whole lines starting with a URL, then inline ones
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = s.replace('\n', ' ')

    # Remove punctuations
    if remove_punc:
        s = s.translate(str.maketrans('', '', string.punctuation))

    # Remove distracting single quotes and dotted pattern
    s = s.replace("'", " ")
    s = s.replace("..", "")

    return s.strip()
|
152 |
+
|
153 |
+
|
154 |
+
class UdfPreProcessor(BaseComponent):
    """
    class to preprocess the document returned by FileConverter. It will check
    for splitting strategy and splits the document by word or sentences and then
    synthetically create the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor

    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):

        """ this is required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by Fileconverter
        remove_punc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragraphs from document,
        it defines the length of paragraph.
        split_respect_sentence_boundary: Used when using 'word' strategy for
        splitting of text. It is forced to False when split_by is 'sentence'.
        split_overlap: Number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read in together with others. Therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output will contain 4 objects
        the paragraphs text list as List ('paraList'), the Haystack documents
        ('documents'), a Dataframe ('dataframe') and the joined raw text
        ('text'). The paragraphs of all given documents are included.

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary= split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True
        )

        # collect the paragraphs of every document instead of keeping only
        # those of the last one; also keeps the name defined for empty input
        docs_processed = []
        for document in documents:
            paragraphs = preprocessor.process([document])
            for paragraph in paragraphs:
                paragraph.content = basic(paragraph.content,
                                          remove_punc= remove_punc)
            docs_processed.extend(paragraphs)

        df = pd.DataFrame(docs_processed)
        # taken from the documents directly so that an empty input, for which
        # the dataframe has no 'content' column, does not fail
        para_list = [paragraph.content for paragraph in docs_processed]
        all_text = " ".join(para_list)
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                 }
        return output, "output_1"

    # NOTE(review): defined without 'self', so it cannot be called on an
    # instance; kept as is because nothing in this file calls it.
    def run_batch():
        """
        we dont have requirement to process the multiple files in one go
        therefore nothing here, however to use the custom node we need to have
        this method for the class.
        """
        return
|
242 |
+
|
243 |
+
def processingpipeline():
    """
    Assembles and returns the preprocessing pipeline: a FileConverter node fed
    by the pipeline input "File", followed by a UdfPreProcessor node fed by
    the FileConverter (both classes are from utils.preprocessing).

    """

    pipeline = Pipeline()

    # (node name, component, inputs) in the order the nodes are chained
    stages = (
        ("FileConverter", FileConverter(), ["File"]),
        ('UdfPreProcessor', UdfPreProcessor(), ["FileConverter"]),
    )
    for node_name, component, inputs in stages:
        pipeline.add_node(component=component, name=node_name, inputs=inputs)

    return pipeline
|
260 |
+
|
utils/sdg_classifier.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack.nodes import TransformersDocumentClassifier
|
2 |
+
from haystack.schema import Document
|
3 |
+
from typing import List, Tuple
|
4 |
+
from typing_extensions import Literal
|
5 |
+
import logging
|
6 |
+
import pandas as pd
|
7 |
+
from pandas import DataFrame, Series
|
8 |
+
from utils.checkconfig import getconfig
|
9 |
+
from utils.streamlitcheck import check_streamlit
|
10 |
+
from utils.preprocessing import processingpipeline
|
11 |
+
try:
|
12 |
+
import streamlit as st
|
13 |
+
except ImportError:
|
14 |
+
logging.info("Streamlit not installed")
|
15 |
+
|
16 |
+
## Labels dictionary ###
|
17 |
+
_lab_dict = {0: 'no_cat',
|
18 |
+
1:'SDG 1 - No poverty',
|
19 |
+
2:'SDG 2 - Zero hunger',
|
20 |
+
3:'SDG 3 - Good health and well-being',
|
21 |
+
4:'SDG 4 - Quality education',
|
22 |
+
5:'SDG 5 - Gender equality',
|
23 |
+
6:'SDG 6 - Clean water and sanitation',
|
24 |
+
7:'SDG 7 - Affordable and clean energy',
|
25 |
+
8:'SDG 8 - Decent work and economic growth',
|
26 |
+
9:'SDG 9 - Industry, Innovation and Infrastructure',
|
27 |
+
10:'SDG 10 - Reduced inequality',
|
28 |
+
11:'SDG 11 - Sustainable cities and communities',
|
29 |
+
12:'SDG 12 - Responsible consumption and production',
|
30 |
+
13:'SDG 13 - Climate action',
|
31 |
+
14:'SDG 14 - Life below water',
|
32 |
+
15:'SDG 15 - Life on land',
|
33 |
+
16:'SDG 16 - Peace, justice and strong institutions',
|
34 |
+
17:'SDG 17 - Partnership for the goals',}
|
35 |
+
|
36 |
+
@st.cache(allow_output_mutation=True)
def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
    """
    loads the document classifier using haystack, where the name/path of model
    in HF-hub as string is used to fetch the model object. Either config file
    or model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Params
    --------
    config_file: config file path from which to read the model name; the name
        is read from option 'MODEL' of section 'sdg'.
    classifier_name: if the model name is passed it takes priority, otherwise
        the config file is used.

    Return
    --------
    document classifier model, or None (after logging a warning) when neither
    argument is given or the config file could not be read.
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return None

        config = getconfig(config_file)
        if config is None:
            # getconfig only logs and returns None when the file cannot be
            # read; stop here instead of failing on config.get below.
            logging.warning("Config file could not be read, classifier not loaded")
            return None
        classifier_name = config.get('sdg','MODEL')

    logging.info("Loading classifier")
    doc_classifier = TransformersDocumentClassifier(
                        model_name_or_path=classifier_name,
                        task="text-classification")

    return doc_classifier
|
68 |
+
|
69 |
+
|
70 |
+
@st.cache(allow_output_mutation=True)
def sdg_classification(haystack_doc:List[Document],
                        threshold:float = 0.8,
                        classifier_model:TransformersDocumentClassifier= None
                        )->Tuple[DataFrame,DataFrame]:
    """
    Text-Classification on the list of texts provided. Classifier provides the
    most appropriate label for each text; the label tells which particular
    Sustainable Development Goal (SDG) the text belongs to.

    Params
    ---------
    haystack_doc: List of haystack Documents. The output of Preprocessing
        Pipeline contains the list of paragraphs in different formats, here
        the list of Haystack Documents is used.
    threshold: only results with a score strictly above this value are kept.
    classifier_model: you can pass the classifier model directly, which takes
        priority; if not passed, the model is looked up in the streamlit
        session under key 'sdg_classifier'.
        In case of streamlit avoid passing the model directly.

    Returns
    ----------
    df: Dataframe with columns ['SDG', 'Relevancy', 'text'], filtered by
        threshold and sorted by SDG number.
    x: Dataframe with one row per SDG found and columns
        ['SDG', 'count', 'SDG_name', 'SDG_Num'], sorted by count descending.

    Returns None (after logging a warning) when no classifier is passed and
    no streamlit environment is found.
    """
    logging.info("Working on SDG Classification")
    if not classifier_model:
        if check_streamlit():
            classifier_model = st.session_state['sdg_classifier']
        else:
            logging.warning("No streamlit envinornment found, Pass the classifier")
            return None

    results = classifier_model.predict(haystack_doc)

    labels_ = [(doc.meta['classification']['label'],
                doc.meta['classification']['score'], doc.content,)
               for doc in results]

    df = DataFrame(labels_, columns=["SDG","Relevancy","text"])

    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    # copy so that the column assignment further down works on an independent
    # frame and not on a slice of the unfiltered one
    df = df[df['Relevancy']>threshold].copy()

    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
    x = df['SDG'].value_counts()
    x = x.rename('count')
    x = x.rename_axis('SDG').reset_index()
    x["SDG"] = pd.to_numeric(x["SDG"])
    x = x.sort_values(by=['count'], ascending=False)
    x['SDG_name'] = x['SDG'].apply(lambda sdg: _lab_dict[sdg])
    x['SDG_Num'] = x['SDG'].apply(lambda sdg: "SDG "+str(sdg))

    df['SDG'] = pd.to_numeric(df['SDG'])
    df = df.sort_values('SDG')

    return df, x
|
131 |
+
|
132 |
+
def runSDGPreprocessingPipeline(file_name:str, file_path:str,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
    """
    Creates the preprocessing pipeline and runs it on the given file.

    Params
    ------------
    file_name: filename, in case of streamlit application use
        st.session_state['filename']
    file_path: filepath, in case of streamlit application use
        st.session_state['filepath']
    split_by: document splitting strategy either as word or sentence
    split_length: when synthetically creating the paragraphs from document,
        it defines the length of paragraph.
    split_respect_sentence_boundary: Used when using 'word' strategy for
        splitting of text.
    split_overlap: Number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make
        sense when read together with others.
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Return
    --------------
    The raw output of the pipeline run. For the Haystack implementation of
    SDG classification the list of Haystack Documents is needed, which can be
    fetched by key = 'documents' on the output.
    """
    sdg_processing_pipeline = processingpipeline()

    converter_params = {"file_path": file_path, "file_name": file_name}
    preprocessor_params = {
        "remove_punc": remove_punc,
        "split_by": split_by,
        "split_length": split_length,
        "split_overlap": split_overlap,
        "split_respect_sentence_boundary": split_respect_sentence_boundary,
    }

    return sdg_processing_pipeline.run(
        file_paths=file_path,
        params={"FileConverter": converter_params,
                "UdfPreProcessor": preprocessor_params})
|
utils/semantic_search.py
ADDED
@@ -0,0 +1,582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from haystack.nodes import TransformersQueryClassifier, Docs2Answers
|
2 |
+
from haystack.nodes import EmbeddingRetriever, FARMReader
|
3 |
+
from haystack.nodes.base import BaseComponent
|
4 |
+
from haystack.document_stores import InMemoryDocumentStore
|
5 |
+
from markdown import markdown
|
6 |
+
from annotated_text import annotation
|
7 |
+
from haystack.schema import Document
|
8 |
+
from typing import List, Text, Union
|
9 |
+
from typing_extensions import Literal
|
10 |
+
from utils.preprocessing import processingpipeline
|
11 |
+
from utils.streamlitcheck import check_streamlit
|
12 |
+
from haystack.pipelines import Pipeline
|
13 |
+
import pandas as pd
|
14 |
+
import logging
|
15 |
+
try:
|
16 |
+
from termcolor import colored
|
17 |
+
except:
|
18 |
+
pass
|
19 |
+
try:
|
20 |
+
import streamlit as st
|
21 |
+
except ImportError:
|
22 |
+
logging.info("Streamlit not installed")
|
23 |
+
|
24 |
+
|
25 |
+
@st.cache(allow_output_mutation=True)
def loadQueryClassifier():
    """
    Creates the haystack query classifier.
    model = shahrukhx01/bert-mini-finetune-question-detection

    Return
    ------
    TransformersQueryClassifier object built from the model above.
    """
    model_name = "shahrukhx01/bert-mini-finetune-question-detection"
    return TransformersQueryClassifier(model_name_or_path=model_name)
|
35 |
+
|
36 |
+
class QueryCheck(BaseComponent):
    """
    Uses Query Classifier from Haystack, processes the query based on query
    type. The classifier is not very good at recognising statements, so a
    statement may get rewritten too. Ex: "List water related issues" will be
    identified by the model as keywords, and therefore be processed as "what
    are the List water related issues related issues and discussions?".
    This shortcoming is ignored for now, as semantic search is not affected
    a lot by it. To process a list of keywords in one go use run_batch.
    Example: to find relevant passages for water, food security, poverty use
    querylist = ["water", "food security", "poverty"] and then execute
    QueryCheck.run_batch(queries = querylist)

    1. https://docs.haystack.deepset.ai/docs/query_classifier

    """

    outgoing_edges = 1

    # Single source for the rewritten query. Kept as one plain literal: a
    # backslash continuation inside a string literal would embed the
    # indentation of the following source line into the query text.
    _KEYWORD_TEMPLATE = "what are the {} related issues and discussions?"

    def run(self, query:str):
        """
        mandatory method to use the custom node. Determines the query type;
        if the classifier does not route the query to "output_1" it is
        treated as keyword/statement and rewritten into a question to make
        it more useful for sentence transformers.

        Params
        --------
        query: query/statement/keywords in form of string

        Return
        ------
        output: dictionary with keys 'query' (original or rewritten query)
            and 'query_type'.

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """
        query_classifier = loadQueryClassifier()
        result = query_classifier.run(query=query)

        if result[1] == "output_1":
            output = {"query":query,
                      "query_type": 'question/statement'}
        else:
            output = {"query": self._KEYWORD_TEMPLATE.format(query),
                      "query_type": 'statements/keyword'}
        logging.info(output)
        return output, "output_1"

    def run_batch(self, queries:List[str]):
        """
        running multiple queries in one go, however the queries need to be
        passed as list of string. Example: to find relevant passages for
        water, food security, poverty use querylist = ["water",
        "food security", "poverty"] and then execute
        QueryCheck.run_batch(queries = querylist)

        Params
        --------
        queries: queries/statements/keywords in form of string encapsulated
            within List

        Return
        ------
        output: dictionary with key 'queries' holding the list of original
            or rewritten queries, in input order.

        output_1: As there is only one outgoing edge, we pass 'output_1' string
        """
        query_classifier = loadQueryClassifier()
        query_list = []
        for query in queries:
            result = query_classifier.run(query=query)
            if result[1] == "output_1":
                query_list.append(query)
            else:
                query_list.append(self._KEYWORD_TEMPLATE.format(query))
        output = {'queries':query_list}
        logging.info(output)
        return output, "output_1"
|
117 |
+
|
118 |
+
|
119 |
+
@st.cache(allow_output_mutation=True)
def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                split_by: Literal["sentence", "word"] = 'sentence',
                split_length:int = 2, split_overlap:int = 0,
                split_respect_sentence_boundary:bool = False,
                remove_punc:bool = False)->List[Document]:
    """
    Creates the preprocessing pipeline and runs it on the given file.

    Params
    ------------
    file_name: filename, in case of streamlit application use
        st.session_state['filename']
    file_path: filepath, in case of streamlit application use
        st.session_state['filepath']
    split_by: document splitting strategy either as word or sentence
    split_length: when synthetically creating the paragraphs from document,
        it defines the length of paragraph.
    split_overlap: Number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make
        sense when read together with others.
    split_respect_sentence_boundary: Used when using 'word' strategy for
        splitting of text.
    remove_punc: to remove all Punctuation including ',' and '.' or not

    Return
    --------------
    The raw output of the pipeline run. For the Haystack implementation of
    semantic search the list of Haystack Documents is needed, which can be
    fetched by key = 'documents' on the output.
    """
    semantic_processing_pipeline = processingpipeline()

    converter_params = {"file_path": file_path, "file_name": file_name}
    preprocessor_params = {
        "remove_punc": remove_punc,
        "split_by": split_by,
        "split_length": split_length,
        "split_overlap": split_overlap,
        "split_respect_sentence_boundary": split_respect_sentence_boundary,
    }

    return semantic_processing_pipeline.run(
        file_paths=file_path,
        params={"FileConverter": converter_params,
                "UdfPreProcessor": preprocessor_params})
|
166 |
+
|
167 |
+
|
168 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                 embedding_layer:int = None, retriever_top_k:int = 10,
                 max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
    """
    Returns the Retriever model based on params provided.
    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py

    Params
    ---------
    embedding_model: Name of the model to be used for embedding. Check the
        links provided in documentation
    embedding_model_format: check the github link of Haystack provided in
        documentation
    embedding_layer: check the github link of Haystack provided in
        documentation
    retriever_top_k: Number of Top results to be returned by retriever
    max_seq_len: every model has a max seq len it can handle, check in
        model card. Needed to handle the edge cases.
    document_store: InMemoryDocumentStore, write haystack Document list to
        DocumentStore and pass the same to function call. Can be done using
        createDocumentStore from utils.

    Return
    -------
    retriever: embedding model, or None (after logging a warning) when no
        document store is passed. When running inside streamlit the
        retriever is also stored in st.session_state['retriever'].
    """
    logging.info("loading retriever")
    if document_store is None:
        logging.warning("Retriever initialization requires the DocumentStore")
        return None

    retriever = EmbeddingRetriever(
                embedding_model=embedding_model,top_k = retriever_top_k,
                document_store = document_store,
                emb_extraction_layer=embedding_layer, scale_score =True,
                model_format=embedding_model_format, use_gpu = True,
                max_seq_len = max_seq_len )
    # check_streamlit has to be called: the bare function object is always
    # truthy, which made this branch run unconditionally.
    if check_streamlit():
        st.session_state['retriever'] = retriever
    return retriever
|
212 |
+
|
213 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                        embedding_dim:int = 768):
    """
    Builds an InMemoryDocumentStore and writes the given haystack Documents
    into it. A document store is a mandatory component for the Retriever to
    work in the Haystack framework.

    Params
    -------
    documents: List of haystack document. If using the preprocessing pipeline,
        can be fetched by key = 'documents' on output of preprocessing
        pipeline.
    similarity: scoring function, can be either 'cosine' or 'dot_product'
    embedding_dim: Document store has default value of embedding size = 768,
        and update_embeddings method of Docstore cannot infer the embedding
        size of retriever automatically, therefore set this value as per the
        model card.

    Return
    -------
    document_store: InMemory Document Store object type.
    """
    store = InMemoryDocumentStore(embedding_dim=embedding_dim,
                                  similarity=similarity)
    store.write_documents(documents)
    return store
|
240 |
+
|
241 |
+
|
242 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                embedding_model_format:Text = None,embedding_layer:int = None,
                embedding_dim:int = 768,retriever_top_k:int = 10,
                reader_model:str = None, reader_top_k:int = 10,
                max_seq_len:int =512,useQueryCheck = True,
                top_k_per_candidate:int = 1):
    """
    Creates the semantic search pipeline and the document store from the
    list of haystack documents. The Reader is used to extract/highlight the
    context for each retrieved result rather than for QA as such, although
    it can work for QA in a limited sense. QueryCheck can be added as the
    first node to process the query.
    There are 4 variants of pipeline it can return
    1.QueryCheck > Retriever > Reader
    2.Retriever > Reader
    3.QueryCheck > Retriever > Docs2Answers : If reader is None,
        then Docs2Answers is used to keep the output of pipeline
        structurally same.
    4.Retriever > Docs2Answers

    Links

    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
    4. https://docs.haystack.deepset.ai/docs/reader

    Params
    ----------
    documents: list of Haystack Documents, returned by preprocessing pipeline.
    embedding_model: Name of the model to be used for embedding. Check the
        links provided in documentation
    embedding_model_format: check the github link of Haystack provided in
        documentation
    embedding_layer: check the github link of Haystack provided in
        documentation
    embedding_dim: Document store has default value of embedding size = 768,
        and update_embeddings method of Docstore cannot infer the embedding
        size of retriever automatically, therefore set this value as per the
        model card.
    retriever_top_k: Number of Top results to be returned by retriever
    reader_model: Name of the model to be used for Reader node in haystack
        Pipeline. If empty/None, Docs2Answers is used instead of a Reader.
    reader_top_k: top_k passed to the Reader. As the purpose here is to use
        the reader to extract context, its default equals retriever_top_k.
    max_seq_len: every model has a max seq len it can handle, check in model
        card. Needed to handle the edge cases
    useQueryCheck: Whether to use the querycheck which modifies the query
        or not.
    top_k_per_candidate: How many answers to extract for each candidate doc
        that is coming from the retriever

    Return
    ---------
    semantic_search_pipeline: Haystack Pipeline object, with the nodes
        [QueryCheck (optional), EmbeddingRetriever, FARMReader/Docs2Answers].

    document_store: InMemoryDocumentStore built from the documents, with the
        retriever having updated the embeddings of each paragraph in it.
    """
    document_store = createDocumentStore(documents=documents,
                                         embedding_dim=embedding_dim)
    retriever = loadRetriever(embedding_model = embedding_model,
                    embedding_model_format=embedding_model_format,
                    embedding_layer=embedding_layer,
                    retriever_top_k= retriever_top_k,
                    document_store = document_store,
                    max_seq_len=max_seq_len)
    document_store.update_embeddings(retriever)

    semantic_search_pipeline = Pipeline()

    # first decision: is the query pre-processed by QueryCheck or not
    if useQueryCheck:
        semantic_search_pipeline.add_node(component = QueryCheck(),
                            name = "QueryCheck",inputs = ["Query"])
        retriever_inputs = ["QueryCheck.output_1"]
    else:
        retriever_inputs = ["Query"]

    semantic_search_pipeline.add_node(component = retriever,
                        name = "EmbeddingRetriever",inputs = retriever_inputs)

    # second decision: Reader when a reader model is given, else Docs2Answers
    if reader_model:
        final_node = FARMReader(model_name_or_path=reader_model,
                        top_k = reader_top_k, use_gpu=True,
                        top_k_per_candidate = top_k_per_candidate)
        final_name = "FARMReader"
    else:
        final_node = Docs2Answers()
        final_name = "Docs2Answers"

    semantic_search_pipeline.add_node(component = final_node,
                        name = final_name,inputs= ["EmbeddingRetriever"])

    logging.info(semantic_search_pipeline.components)
    return semantic_search_pipeline, document_store
|
358 |
+
|
359 |
+
def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
    """
    will use the haystack run or run_batch based on if single query is passed
    as string or multiple queries as List[str]

    Params
    -------
    pipeline: haystack pipeline, this is same as returned by
        semanticSearchPipeline from utils.semantic_search

    queries: Either a single query or list of queries.

    Return
    -------
    results: Dict containing answers and documents as key and their
        respective values. None (after logging) when queries is neither a
        list nor a string.

    """
    # isinstance also accepts subclasses of list/str
    if isinstance(queries, list):
        return pipeline.run_batch(queries=queries)
    if isinstance(queries, str):
        return pipeline.run(query=queries)

    logging.info("Please check the input type for the queries")
    return None
|
387 |
+
|
388 |
+
def process_query_output(results:dict)->pd.DataFrame:
    """
    Returns the dataframe with columns
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id'], one row per retrieved document.
    This is designed for output given by semantic search pipeline with single
    query and final node as reader. The output of pipeline having
    Docs2Answers as final node or multiple queries need to be handled
    separately. In these other cases, use process_semantic_output from
    utils.semantic_search which uses this function internally to make one
    combined dataframe.

    Params
    ---------
    results: this dictionary should have key,values with
        keys = [query,answers,documents], however answers is optional.
        When answers is missing, or a document has no answer, the answer
        related columns of that row are set to None.

    Return
    --------
    df: dataframe with all the columns mentioned in function description.

    """
    columns = ['query','answer','answer_offset','context_offset',
               'context','content','reader_score','retriever_score','id']
    query_text = results['query']

    # answers keyed by the id of the document they were extracted from; if a
    # document has several answers the last one listed is kept
    answer_dict = {answer.document_id: answer.to_dict()
                   for answer in results.get('answers', [])}

    rows = []
    for doc in results['documents']:
        row = {'query': query_text,
               'retriever_score': doc.score,
               'id': doc.id,
               'content': doc.content}
        answer = answer_dict.get(doc.id)
        if answer is not None:
            context = answer['context']
            answer_offset = answer['offsets_in_document'][0]
            # position of the context inside the document text
            start_idx = doc.content.find(context)
            row['answer'] = answer['answer']
            row['context'] = context
            row['reader_score'] = answer['score']
            row['answer_offset'] = [answer_offset['start'],
                                    answer_offset['end']]
            row['context_offset'] = [start_idx, start_idx + len(context)]
        else:
            row['answer'] = None
            row['context'] = None
            row['reader_score'] = None
            row['answer_offset'] = None
            row['context_offset'] = None
        rows.append(row)

    # build the frame once instead of concatenating one row at a time
    return pd.DataFrame(rows, columns=columns)
|
449 |
+
|
450 |
+
def process_semantic_output(results):
    """
    Converts the raw semantic pipeline output into one dataframe with columns
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id'].

    Distinguishes between a single query and multiple queries by reading the
    keys of the pipeline output dictionary ('query' vs 'queries'). Each query
    is converted by process_query_output and the per-query dataframes are
    concatenated. When the final node is 'Docs2Answers' the answers are not
    forwarded to process_query_output.

    Params
    ---------
    results: raw output of runSemanticPipeline.

    Return
    --------
    df: dataframe with all the columns mentioned in function description.
        An empty dataframe with these columns is returned when the output
        contains neither 'query' nor 'queries' (or an empty 'queries' list).

    """
    columns = ['query','answer','answer_offset',
               'context_offset','context','content',
               'reader_score','retriever_score','id']
    # Docs2Answers as final node: answers are deliberately dropped.
    drop_answers = results.get('node_id') == 'Docs2Answers'

    if 'query' in results:
        output = {'query': results['query'],
                  'documents': results['documents']}
        # 'answers' is optional for process_query_output, so only forward it
        # when the pipeline actually produced it.
        if not drop_answers and 'answers' in results:
            output['answers'] = results['answers']
        return process_query_output(output)

    # Bound up-front so the function always returns a dataframe, even when
    # the pipeline output carries no query information at all.
    df = pd.DataFrame(columns=columns)
    if 'queries' in results:
        for query, answers, documents in zip(results['queries'],
                            results['answers'], results['documents']):
            output = {'query': query, 'documents': documents}
            if not drop_answers:
                output['answers'] = answers

            temp = process_query_output(output)
            df = pd.concat([df, temp], ignore_index=True)

    return df
|
498 |
+
|
499 |
+
def semanticsearchAnnotator(matches:List[List[int]], document:Text):
    """
    Annotates the spans of the document given as a list of
    [start index, end index] pairs and displays the result: written through
    streamlit when check_streamlit() is truthy, printed to stdout otherwise.

    The indices are used to slice `document` directly, e.g. for
    "How are you today" with a plain text document, matches = [[0,3]]
    highlights "How". (If a token-based matcher such as spacy produced the
    offsets, [[0,3]] would instead mean "How are you", and "How" would be
    [[0,1]].)

    Params
    ---------
    matches: list of [start index, end index] pairs. Entries that are None
        (no context available for the document) are skipped.
    document: text to annotate.

    """
    # Decide the rendering target once instead of once per match.
    use_streamlit = check_streamlit()

    cursor = 0
    pieces = []
    for match in matches:
        if match is None:
            continue
        start_idx, end_idx = match[0], match[1]
        # Un-highlighted text between the previous match and this one.
        pieces.append(document[cursor:start_idx])
        span = document[start_idx:end_idx]
        if use_streamlit:
            pieces.append(str(annotation(body=span,
                    label="Context", background="#964448", color='#ffffff')))
        else:
            pieces.append(colored(span, "green", attrs = ['bold']))
        cursor = end_idx

    # Remainder after the last match; with no matches this is the whole
    # document (the cursor is still 0).
    pieces.append(document[cursor:])
    annotated_text = "".join(pieces)

    if use_streamlit:
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)
|
533 |
+
|
534 |
+
|
535 |
+
def semantic_keywordsearch(query:Text,documents:List[Document],
                    embedding_model:Text,
                    embedding_model_format:Text,
                    embedding_layer:int, reader_model:str,
                    retriever_top_k:int = 10, reader_top_k:int = 10,
                    return_results:bool = False, embedding_dim:int = 768,
                    max_seq_len:int = 512,top_k_per_candidate:int =1,
                    sort_by:Literal["retriever", "reader"] = 'retriever'):
    """
    Performs the semantic search on the list of haystack documents which is
    returned by the preprocessing pipeline.

    Params
    -------
    query: Keywords that need to be searched in documents.
    documents: List of Haystack documents returned by preprocessing pipeline.
    embedding_model, embedding_model_format, embedding_layer, reader_model,
    retriever_top_k, reader_top_k, embedding_dim, max_seq_len,
    top_k_per_candidate: forwarded unchanged to semanticSearchPipeline.
    return_results: if True the results dataframe is returned, otherwise the
        results are displayed (streamlit or stdout) and nothing is returned.
    sort_by: 'retriever' sorts the results by 'retriever_score' (descending);
        any other value sorts by 'reader_score' (descending).

    Return
    -------
    results_df: sorted dataframe produced by process_semantic_output, only
        when return_results is True.

    """
    # The document store returned alongside the pipeline is not needed here.
    semanticsearch_pipeline, _doc_store = semanticSearchPipeline(documents = documents,
                        embedding_model= embedding_model,
                        embedding_layer= embedding_layer,
                        embedding_model_format= embedding_model_format,
                        reader_model= reader_model, retriever_top_k= retriever_top_k,
                        reader_top_k= reader_top_k, embedding_dim=embedding_dim,
                        max_seq_len=max_seq_len,
                        top_k_per_candidate=top_k_per_candidate)

    raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
    results_df = process_semantic_output(raw_output)
    score_column = 'retriever_score' if sort_by == 'retriever' else 'reader_score'
    results_df = results_df.sort_values(by=[score_column], ascending=False)

    if return_results:
        return results_df

    # check_streamlit must be *called*; the bare function object is always
    # truthy and would force the streamlit branch even on the console.
    use_streamlit = check_streamlit()
    if use_streamlit:
        st.markdown("##### Top few semantic search results #####")
    else:
        print("Top few semantic search results")

    # Iterate positionally: after sort_values the index labels no longer
    # follow the sorted order, so label-based .loc[i] would ignore the sort.
    for rank, (_label, row) in enumerate(results_df.iterrows(), start=1):
        if use_streamlit:
            st.write("Result {}".format(rank))
        else:
            print("Result {}".format(rank))
        semanticsearchAnnotator([row['context_offset']], row['content'])
|
utils/streamlitcheck.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
try:
|
3 |
+
import streamlit as st
|
4 |
+
except ImportError:
|
5 |
+
logging.info("Streamlit not installed")
|
6 |
+
|
7 |
+
|
8 |
+
def check_streamlit():
    """
    Function to check whether python code is run within streamlit

    The script-run-context accessor is looked up under the newer
    `streamlit.runtime.scriptrunner` location first and under the legacy
    `streamlit.scriptrunner.script_run_context` location second, so the check
    keeps working across streamlit releases.

    Returns
    -------
    use_streamlit : boolean
        True if code is run within streamlit, else False
    """
    try:
        from streamlit.runtime.scriptrunner import get_script_run_ctx
    except ImportError:
        try:
            from streamlit.scriptrunner.script_run_context import get_script_run_ctx
        except ImportError:
            # streamlit missing, or neither known module layout available.
            return False

    # A falsy context means the code is not executed by a streamlit script run.
    use_streamlit = bool(get_script_run_ctx())
    return use_streamlit
|
26 |
+
|
27 |
+
def disable_other_checkboxes(*other_checkboxes_keys):
    """
    Sets the streamlit session_state entry of every given key to False.

    Params
    ---------
    other_checkboxes_keys: session_state keys of the checkboxes to untick.
    """
    for key_to_reset in other_checkboxes_keys:
        st.session_state[key_to_reset] = False
|
30 |
+
|
31 |
+
def checkbox_without_preselect(keylist):
    """
    Renders one streamlit checkbox per entry of keylist, each using its label
    as widget key, and wires them so that changing one checkbox resets the
    session_state of all the others (via disable_other_checkboxes).

    Params
    ---------
    keylist: sequence of checkbox labels (also used as widget keys).

    Return
    --------
    The first label in keylist whose checkbox is ticked, or None if no
    checkbox is ticked.
    """
    # Every checkbox must be rendered before a selection is returned, so the
    # states are collected first and inspected afterwards.
    states = [
        st.checkbox(label, key = label,
                    on_change = disable_other_checkboxes,
                    args=tuple(other for other in keylist if other != label),)
        for label in keylist
    ]

    for label, checked in zip(keylist, states):
        if checked:
            return label

    return None
|
utils/uploadAndExample.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import tempfile
|
3 |
+
import json
|
4 |
+
|
5 |
+
def add_upload(choice):
    """
    Handles the user's choice of either 'Upload Document' or trying an
    example, and stores the resulting file name and path in the streamlit
    session_state under 'filename' and 'filepath' for later retrieval.

    Params
    ---------
    choice: 'Upload Document' shows a sidebar file uploader; any other value
        shows a sidebar selectbox listing the example documents read from
        'docStore/sample/files.json'.
    """

    if choice != 'Upload Document':
        # listing the options
        with open('docStore/sample/files.json','r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                        list(files.keys()))
        # For examples the stored value serves as both name and path.
        example_file = files[option]
        st.session_state['filename'] = example_file
        st.session_state['filepath'] = example_file
        return

    uploaded_file = st.sidebar.file_uploader('Upload the File',
                        type=['pdf', 'docx', 'txt'])
    if uploaded_file is None:
        return

    # delete=False keeps the temporary copy on disk after the handle closes,
    # so the stored path stays usable later on.
    with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
        temp.write(uploaded_file.getvalue())
        st.session_state['filename'] = uploaded_file.name
        st.session_state['filepath'] = temp.name
|