prashant
committed on
Commit
·
cc5c327
1
Parent(s):
4a20529
lexical search app update
Browse files- app.py +2 -2
- appStore/keyword_search.py +18 -107
- appStore/sdg_analysis.py +6 -1
- paramconfig.cfg +2 -0
- utils/search.py +104 -7
- ver0.1 scripts/keyword_search.py +169 -0
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
|
2 |
import appStore.sdg_analysis as sdg_analysis
|
3 |
#import appStore.coherence as coherence
|
4 |
import appStore.info as info
|
@@ -12,6 +12,6 @@ app = MultiApp()
|
|
12 |
|
13 |
app.add_app("About","house", info.app)
|
14 |
app.add_app("SDG Analysis","gear",sdg_analysis.app)
|
15 |
-
|
16 |
|
17 |
app.run()
|
|
|
1 |
+
import appStore.keyword_search as keyword_search
|
2 |
import appStore.sdg_analysis as sdg_analysis
|
3 |
#import appStore.coherence as coherence
|
4 |
import appStore.info as info
|
|
|
12 |
|
13 |
app.add_app("About","house", info.app)
|
14 |
app.add_app("SDG Analysis","gear",sdg_analysis.app)
|
15 |
+
app.add_app("Search","search", keyword_search.app)
|
16 |
|
17 |
app.run()
|
appStore/keyword_search.py
CHANGED
@@ -1,38 +1,12 @@
|
|
1 |
# set path
|
2 |
-
import glob, os, sys
|
3 |
-
|
4 |
-
sys.path.append('../udfPreprocess')
|
5 |
|
6 |
-
#import helper
|
7 |
-
import udfPreprocess.docPreprocessing as pre
|
8 |
-
import udfPreprocess.cleaning as clean
|
9 |
-
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
|
10 |
-
#import needed libraries
|
11 |
-
import seaborn as sns
|
12 |
-
from pandas import DataFrame
|
13 |
-
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
14 |
-
# from keybert import KeyBERT
|
15 |
-
from transformers import pipeline
|
16 |
-
import matplotlib.pyplot as plt
|
17 |
-
import numpy as np
|
18 |
import streamlit as st
|
19 |
-
import pandas as pd
|
20 |
-
from rank_bm25 import BM25Okapi
|
21 |
-
from sklearn.feature_extraction import _stop_words
|
22 |
-
import string
|
23 |
-
from tqdm.autonotebook import tqdm
|
24 |
-
import numpy as np
|
25 |
-
import docx
|
26 |
-
from docx.shared import Inches
|
27 |
-
from docx.shared import Pt
|
28 |
-
from docx.enum.style import WD_STYLE_TYPE
|
29 |
-
import logging
|
30 |
-
logger = logging.getLogger(__name__)
|
31 |
-
import tempfile
|
32 |
-
import sqlite3
|
33 |
import json
|
34 |
-
import
|
35 |
-
|
|
|
36 |
|
37 |
def app():
|
38 |
|
@@ -54,11 +28,9 @@ def app():
|
|
54 |
""")
|
55 |
|
56 |
st.markdown("")
|
57 |
-
|
58 |
|
59 |
-
|
60 |
with st.sidebar:
|
61 |
-
with open('sample/keywordexample.json','r') as json_file:
|
62 |
keywordexample = json.load(json_file)
|
63 |
|
64 |
genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
|
@@ -76,93 +48,32 @@ def app():
|
|
76 |
keywordList = None
|
77 |
|
78 |
searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
|
79 |
-
|
80 |
-
|
81 |
with st.container():
|
82 |
if keywordList is not None:
|
83 |
queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
|
84 |
value="{}".format(keywordList))
|
85 |
else:
|
86 |
queryList = st.text_input("Please enter here your question and we will look \
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
placeholder="Enter keyword here")
|
92 |
-
|
93 |
if st.button("Find them"):
|
94 |
|
95 |
if queryList == "":
|
96 |
st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
|
97 |
logging.warning("Terminated as no keyword provided")
|
98 |
else:
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
paraList = st.session_state['paraList']
|
103 |
-
|
104 |
if searchtype == 'Exact Matches':
|
105 |
-
queryList = list(queryList.split(","))
|
106 |
logging.info("performing lexical search")
|
107 |
-
|
108 |
-
# st.write(len(tokenized_corpus))
|
109 |
-
document_bm25 = BM25Okapi(tokenized_corpus)
|
110 |
-
|
111 |
with st.spinner("Performing Exact matching search (Lexical search) for you"):
|
112 |
-
|
113 |
-
|
114 |
-
for keyword in queryList:
|
115 |
-
|
116 |
-
bm25_hits = lexical_search(keyword,document_bm25)
|
117 |
-
|
118 |
-
|
119 |
-
counter = 0
|
120 |
-
for hit in bm25_hits:
|
121 |
-
if hit['score'] > 0.00:
|
122 |
-
counter += 1
|
123 |
-
if counter == 1:
|
124 |
-
st.markdown("###### Results for keyword: **{}** ######".format(keyword))
|
125 |
-
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
126 |
-
st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
|
127 |
-
|
128 |
-
|
129 |
-
if counter == 0:
|
130 |
-
st.write("No results found for '**{}**' ".format(keyword))
|
131 |
-
|
132 |
-
st.markdown("---")
|
133 |
-
else:
|
134 |
-
logging.info("starting semantic search")
|
135 |
-
with st.spinner("Performing Similar/Contextual search"):
|
136 |
-
query = "Find {} related issues ?".format(queryList)
|
137 |
-
config = configparser.ConfigParser()
|
138 |
-
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
139 |
-
threshold = float(config.get('semantic_search','THRESHOLD'))
|
140 |
-
# st.write(query)
|
141 |
-
semantic_hits = semantic_search(query,paraList)
|
142 |
-
st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
|
143 |
-
|
144 |
-
for i,queryhit in enumerate(semantic_hits):
|
145 |
-
|
146 |
-
# st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
|
147 |
-
counter = 0
|
148 |
-
for hit in queryhit:
|
149 |
-
counter += 1
|
150 |
-
|
151 |
-
|
152 |
-
if hit['score'] > threshold:
|
153 |
-
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
154 |
-
st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
|
155 |
-
|
156 |
-
# document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
157 |
-
st.markdown("---")
|
158 |
-
# st.write(semantic_hits)
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
else:
|
164 |
-
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
165 |
-
logging.warning("Terminated as no keyword provided")
|
166 |
|
167 |
-
|
168 |
-
|
|
|
1 |
# set path
|
2 |
+
import glob, os, sys;
|
3 |
+
sys.path.append('../utils')
|
|
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
import json
|
7 |
+
import logging
|
8 |
+
from utils.search import runLexicalPreprocessingPipeline, tokenize_lexical_query
|
9 |
+
from utils.search import runSpacyMatcher, lexical_search
|
10 |
|
11 |
def app():
|
12 |
|
|
|
28 |
""")
|
29 |
|
30 |
st.markdown("")
|
|
|
31 |
|
|
|
32 |
with st.sidebar:
|
33 |
+
with open('docStore/sample/keywordexample.json','r') as json_file:
|
34 |
keywordexample = json.load(json_file)
|
35 |
|
36 |
genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
|
|
|
48 |
keywordList = None
|
49 |
|
50 |
searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
|
51 |
+
|
|
|
52 |
with st.container():
|
53 |
if keywordList is not None:
|
54 |
queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
|
55 |
value="{}".format(keywordList))
|
56 |
else:
|
57 |
queryList = st.text_input("Please enter here your question and we will look \
|
58 |
+
for an answer in the document OR enter the keyword you \
|
59 |
+
are looking for and we will \
|
60 |
+
we will look for similar context \
|
61 |
+
in the document.",
|
62 |
placeholder="Enter keyword here")
|
63 |
+
|
64 |
if st.button("Find them"):
|
65 |
|
66 |
if queryList == "":
|
67 |
st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
|
68 |
logging.warning("Terminated as no keyword provided")
|
69 |
else:
|
70 |
+
if 'filepath' in st.session_state:
|
71 |
+
paraList = runLexicalPreprocessingPipeline()
|
72 |
+
|
|
|
|
|
73 |
if searchtype == 'Exact Matches':
|
74 |
+
# queryList = list(queryList.split(","))
|
75 |
logging.info("performing lexical search")
|
76 |
+
# token_list = tokenize_lexical_query(queryList)
|
|
|
|
|
|
|
77 |
with st.spinner("Performing Exact matching search (Lexical search) for you"):
|
78 |
+
lexical_search(queryList,paraList)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
|
|
|
appStore/sdg_analysis.py
CHANGED
@@ -46,7 +46,12 @@ def app():
|
|
46 |
|
47 |
if 'filepath' in st.session_state:
|
48 |
paraList = runSDGPreprocessingPipeline()
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
df, x = sdg_classification(paraList)
|
52 |
|
|
|
46 |
|
47 |
if 'filepath' in st.session_state:
|
48 |
paraList = runSDGPreprocessingPipeline()
|
49 |
+
if len(paraList) > 150:
|
50 |
+
warning_msg = ": This might take some, please sit back and relax."
|
51 |
+
else:
|
52 |
+
warning_msg = ""
|
53 |
+
|
54 |
+
with st.spinner("Running SDG Classification{}".format(warning_msg)):
|
55 |
|
56 |
df, x = sdg_classification(paraList)
|
57 |
|
paramconfig.cfg
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 10
|
3 |
THRESHOLD = 0.1
|
|
|
|
|
4 |
|
5 |
[semantic_search]
|
6 |
TOP_K = 10
|
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 10
|
3 |
THRESHOLD = 0.1
|
4 |
+
SPLIT_BY = sentence
|
5 |
+
SPLIT_LENGTH = 3
|
6 |
|
7 |
[semantic_search]
|
8 |
TOP_K = 10
|
utils/search.py
CHANGED
@@ -7,17 +7,55 @@ from spacy.matcher import Matcher
|
|
7 |
import streamlit as st
|
8 |
from markdown import markdown
|
9 |
from annotated_text import annotation
|
|
|
|
|
|
|
10 |
|
11 |
config = configparser.ConfigParser()
|
12 |
config.read_file(open('paramconfig.py'))
|
13 |
|
14 |
|
15 |
-
def tokenize_lexical_query(query):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
nlp = spacy.load("en_core_web_sm")
|
17 |
token_list = [token.text.lower() for token in nlp(query) if not token.is_stop]
|
18 |
return token_list
|
19 |
|
20 |
-
def runSpacyMatcher(token_list, document):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
nlp = spacy.load("en_core_web_sm")
|
22 |
spacydoc = nlp(document)
|
23 |
matcher = Matcher(nlp.vocab)
|
@@ -25,20 +63,47 @@ def runSpacyMatcher(token_list, document):
|
|
25 |
matcher.add(",".join(token_list), token_pattern)
|
26 |
spacymatches = matcher(spacydoc)
|
27 |
|
|
|
28 |
matches = []
|
29 |
for match_id, start, end in spacymatches:
|
30 |
matches = matches + [[start, end]]
|
31 |
|
32 |
return matches, spacydoc
|
33 |
|
34 |
-
def runRegexMatcher(token_list, document):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
matches = []
|
36 |
for token in token_list:
|
37 |
matches = matches + [[val.start(), val.start()+ len(token)] for val in re.finditer(token, document)]
|
38 |
|
39 |
return matches, document
|
40 |
|
41 |
-
def searchAnnotator(matches, document):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
start = 0
|
43 |
annotated_text = ""
|
44 |
for match in matches:
|
@@ -52,10 +117,16 @@ def searchAnnotator(matches, document):
|
|
52 |
unsafe_allow_html=True,
|
53 |
)
|
54 |
|
55 |
-
def lexical_search(query,documents):
|
|
|
|
|
|
|
|
|
56 |
|
57 |
document_store = InMemoryDocumentStore()
|
58 |
document_store.write_documents(documents)
|
|
|
|
|
59 |
retriever = TfidfRetriever(document_store)
|
60 |
results = retriever.retrieve(query=query,
|
61 |
top_k= int(config.get('lexical_search','TOP_K')))
|
@@ -64,5 +135,31 @@ def lexical_search(query,documents):
|
|
64 |
matches, doc = runSpacyMatcher(query_tokens,result.content)
|
65 |
searchAnnotator(matches, doc)
|
66 |
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
import streamlit as st
|
8 |
from markdown import markdown
|
9 |
from annotated_text import annotation
|
10 |
+
from haystack.schema import Document
|
11 |
+
from typing import List, Tuple, Text
|
12 |
+
from utils.preprocessing import processingpipeline
|
13 |
|
14 |
# Module-level configuration shared by the search helpers in this file
# (they read the 'lexical_search' section through config.get).
config = configparser.ConfigParser()
# NOTE(review): the file name ends in '.py' although it is parsed as an
# INI-style config -- confirm the path is the intended one.
# The context manager closes the handle once parsing is done; previously the
# file object was created inline and never closed.
with open('paramconfig.py') as config_file:
    config.read_file(config_file)
|
16 |
|
17 |
|
18 |
+
def tokenize_lexical_query(query:str)-> List[str]:
    """
    Drops the stop words from the query and returns the remaining tokens in
    lowercase. The lexical search itself retrieves the relevant paragraphs
    with the TfidfRetriever from Haystack; the tokenized form of the query is
    what is used afterwards to highlight the keywords in those paragraphs.

    Params
    --------
    query: string which represents either the list of keywords the user is
        looking for or a query in the form of a question.

    Return
    -----------
    token_list: list of the lowercased non-stop-word tokens of the query, in
        the order in which they appear.

    """
    nlp = spacy.load("en_core_web_sm")
    keywords = []
    for token in nlp(query):
        # stop words carry no search value, leave them out
        if token.is_stop:
            continue
        keywords.append(token.text.lower())
    return keywords
|
38 |
|
39 |
+
def runSpacyMatcher(token_list:List[str], document:Text):
|
40 |
+
"""
|
41 |
+
Using the spacy in backend finds the keywords in the document using the
|
42 |
+
Matcher class from spacy. We can alternatively use the regex, but spacy
|
43 |
+
finds all keywords in serialized manner which helps in annotation of answers.
|
44 |
+
|
45 |
+
Params
|
46 |
+
-------
|
47 |
+
token_list: this is token list which tokenize_lexical_query function returns
|
48 |
+
document: text in which we need to find the tokens
|
49 |
+
|
50 |
+
Return
|
51 |
+
--------
|
52 |
+
matches: List of [start_index, end_index] in the spacydoc(at word level not
|
53 |
+
character) for the keywords in token list.
|
54 |
+
|
55 |
+
spacydoc: the keyword index in the spacydoc are at word level and not character,
|
56 |
+
therefore to allow the annotator to work seamlessly we return the spacydoc.
|
57 |
+
|
58 |
+
"""
|
59 |
nlp = spacy.load("en_core_web_sm")
|
60 |
spacydoc = nlp(document)
|
61 |
matcher = Matcher(nlp.vocab)
|
|
|
63 |
matcher.add(",".join(token_list), token_pattern)
|
64 |
spacymatches = matcher(spacydoc)
|
65 |
|
66 |
+
# getting start and end index in spacydoc so that annotator can work seamlessly
|
67 |
matches = []
|
68 |
for match_id, start, end in spacymatches:
|
69 |
matches = matches + [[start, end]]
|
70 |
|
71 |
return matches, spacydoc
|
72 |
|
73 |
+
def runRegexMatcher(token_list:List[str], document:Text) -> Tuple[List[List[int]], Text]:
    """
    Finds every literal occurrence of each keyword in the document using regex.

    Keywords are escaped before being searched for, so characters that have a
    special meaning in a regular expression (e.g. '.', '+', '(') are matched
    literally instead of being interpreted as a pattern.

    Params
    -------
    token_list: this is token list which tokenize_lexical_query function returns

    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the document for the keywords
        in token list at character level. Matches are grouped per keyword, in
        the order of token_list, and are not sorted by position.

    document: the keyword index returned by regex are at character level,
        therefore to allow the annotator to work seamlessly we return the text back.

    """
    matches = []
    for token in token_list:
        if not token:
            # an empty pattern matches at every position of the text and would
            # only produce zero-width spans, so there is nothing to report
            continue
        # escape the keyword so that regex metacharacters are taken literally
        pattern = re.escape(token)
        matches.extend([val.start(), val.end()]
                       for val in re.finditer(pattern, document))

    return matches, document
|
97 |
|
98 |
+
def searchAnnotator(matches: List[List[int]], document):
|
99 |
+
"""
|
100 |
+
Annotates the text in the document defined by list of [start index, end index]
|
101 |
+
Example: "How are you today", if document type is text, matches = [[0,3]]
|
102 |
+
will give answer = "How", however in case we used the spacy matcher then the
|
103 |
+
matches = [[0,3]] will give answer = "How are you". However if spacy is used
|
104 |
+
to find "How" then the matches = [[0,1]] for the string defined above.
|
105 |
+
|
106 |
+
"""
|
107 |
start = 0
|
108 |
annotated_text = ""
|
109 |
for match in matches:
|
|
|
117 |
unsafe_allow_html=True,
|
118 |
)
|
119 |
|
120 |
+
def lexical_search(query:Text,documents:List[Document]):
|
121 |
+
"""
|
122 |
+
Performs the Lexical search on the List of haystack documents which is
|
123 |
+
returned by preprocessing Pipeline.
|
124 |
+
"""
|
125 |
|
126 |
document_store = InMemoryDocumentStore()
|
127 |
document_store.write_documents(documents)
|
128 |
+
|
129 |
+
# Haystack Retriever works with document stores only.
|
130 |
retriever = TfidfRetriever(document_store)
|
131 |
results = retriever.retrieve(query=query,
|
132 |
top_k= int(config.get('lexical_search','TOP_K')))
|
|
|
135 |
matches, doc = runSpacyMatcher(query_tokens,result.content)
|
136 |
searchAnnotator(matches, doc)
|
137 |
|
138 |
+
def runLexicalPreprocessingPipeline()->List[Document]:
    """
    Creates the preprocessing pipeline and runs it on the file whose path and
    name are stored in the Streamlit session state under 'filepath' and
    'filename'. The splitting params (SPLIT_BY, SPLIT_LENGTH) are fetched
    from the 'lexical_search' section of paramconfig.

    Return
    --------------
    List[Document]: the entry stored under key = 'documents' in the output
        of the pipeline run, which is what the lexical search consumes.

    """
    file_path = st.session_state['filepath']
    file_name = st.session_state['filename']
    pipeline = processingpipeline()

    # how the text gets chunked is driven by the config, not hard-coded
    split_by = config.get('lexical_search','SPLIT_BY')
    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))

    # per-node params, keyed by the node names used in the pipeline
    converter_params = {"file_path": file_path,
                        "file_name": file_name}
    preprocessor_params = {"removePunc": False,
                           "split_by": split_by,
                           "split_length": split_length}

    pipeline_output = pipeline.run(file_paths = file_path,
                                   params = {"FileConverter": converter_params,
                                             "UdfPreProcessor": preprocessor_params})

    return pipeline_output['documents']
|
165 |
+
|
ver0.1 scripts/keyword_search.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# set path
|
2 |
+
import glob, os, sys
|
3 |
+
from udfPreprocess.search import semantic_search
|
4 |
+
sys.path.append('../udfPreprocess')
|
5 |
+
|
6 |
+
#import helper
|
7 |
+
import udfPreprocess.docPreprocessing as pre
|
8 |
+
import udfPreprocess.cleaning as clean
|
9 |
+
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
|
10 |
+
#import needed libraries
|
11 |
+
import seaborn as sns
|
12 |
+
from pandas import DataFrame
|
13 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
14 |
+
# from keybert import KeyBERT
|
15 |
+
from transformers import pipeline
|
16 |
+
import matplotlib.pyplot as plt
|
17 |
+
import numpy as np
|
18 |
+
import streamlit as st
|
19 |
+
import pandas as pd
|
20 |
+
from rank_bm25 import BM25Okapi
|
21 |
+
from sklearn.feature_extraction import _stop_words
|
22 |
+
import string
|
23 |
+
from tqdm.autonotebook import tqdm
|
24 |
+
import numpy as np
|
25 |
+
import docx
|
26 |
+
from docx.shared import Inches
|
27 |
+
from docx.shared import Pt
|
28 |
+
from docx.enum.style import WD_STYLE_TYPE
|
29 |
+
import logging
|
30 |
+
logger = logging.getLogger(__name__)
|
31 |
+
import tempfile
|
32 |
+
import sqlite3
|
33 |
+
import json
|
34 |
+
import configparser
|
35 |
+
|
36 |
+
|
37 |
+
def app():
|
38 |
+
|
39 |
+
with st.container():
|
40 |
+
st.markdown("<h1 style='text-align: center; \
|
41 |
+
color: black;'> Search</h1>",
|
42 |
+
unsafe_allow_html=True)
|
43 |
+
st.write(' ')
|
44 |
+
st.write(' ')
|
45 |
+
|
46 |
+
with st.expander("ℹ️ - About this app", expanded=False):
|
47 |
+
|
48 |
+
st.write(
|
49 |
+
"""
|
50 |
+
The *Keyword Search* app is an easy-to-use interface \
|
51 |
+
built in Streamlit for doing keyword search in \
|
52 |
+
policy document - developed by GIZ Data and the \
|
53 |
+
Sustainable Development Solution Network.
|
54 |
+
""")
|
55 |
+
|
56 |
+
st.markdown("")
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
with st.sidebar:
|
61 |
+
with open('sample/keywordexample.json','r') as json_file:
|
62 |
+
keywordexample = json.load(json_file)
|
63 |
+
|
64 |
+
genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
|
65 |
+
if genre == 'Food':
|
66 |
+
keywordList = keywordexample['Food']
|
67 |
+
elif genre == 'Climate':
|
68 |
+
keywordList = keywordexample['Climate']
|
69 |
+
elif genre == 'Social':
|
70 |
+
keywordList = keywordexample['Social']
|
71 |
+
elif genre == 'Nature':
|
72 |
+
keywordList = keywordexample['Nature']
|
73 |
+
elif genre == 'Implementation':
|
74 |
+
keywordList = keywordexample['Implementation']
|
75 |
+
else:
|
76 |
+
keywordList = None
|
77 |
+
|
78 |
+
searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
|
79 |
+
|
80 |
+
|
81 |
+
with st.container():
|
82 |
+
if keywordList is not None:
|
83 |
+
queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
|
84 |
+
value="{}".format(keywordList))
|
85 |
+
else:
|
86 |
+
queryList = st.text_input("Please enter here your question and we will look \
|
87 |
+
for an answer in the document OR enter the keyword you \
|
88 |
+
are looking for and we will \
|
89 |
+
we will look for similar context \
|
90 |
+
in the document.",
|
91 |
+
placeholder="Enter keyword here")
|
92 |
+
|
93 |
+
if st.button("Find them"):
|
94 |
+
|
95 |
+
if queryList == "":
|
96 |
+
st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
|
97 |
+
logging.warning("Terminated as no keyword provided")
|
98 |
+
else:
|
99 |
+
|
100 |
+
if 'docs' in st.session_state:
|
101 |
+
docs = st.session_state['docs']
|
102 |
+
paraList = st.session_state['paraList']
|
103 |
+
|
104 |
+
if searchtype == 'Exact Matches':
|
105 |
+
queryList = list(queryList.split(","))
|
106 |
+
logging.info("performing lexical search")
|
107 |
+
tokenized_corpus = bm25TokenizeDoc(paraList)
|
108 |
+
# st.write(len(tokenized_corpus))
|
109 |
+
document_bm25 = BM25Okapi(tokenized_corpus)
|
110 |
+
|
111 |
+
with st.spinner("Performing Exact matching search (Lexical search) for you"):
|
112 |
+
st.markdown("##### Top few lexical search (BM25) hits #####")
|
113 |
+
|
114 |
+
for keyword in queryList:
|
115 |
+
|
116 |
+
bm25_hits = lexical_search(keyword,document_bm25)
|
117 |
+
|
118 |
+
|
119 |
+
counter = 0
|
120 |
+
for hit in bm25_hits:
|
121 |
+
if hit['score'] > 0.00:
|
122 |
+
counter += 1
|
123 |
+
if counter == 1:
|
124 |
+
st.markdown("###### Results for keyword: **{}** ######".format(keyword))
|
125 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
126 |
+
st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
|
127 |
+
|
128 |
+
|
129 |
+
if counter == 0:
|
130 |
+
st.write("No results found for '**{}**' ".format(keyword))
|
131 |
+
|
132 |
+
st.markdown("---")
|
133 |
+
else:
|
134 |
+
logging.info("starting semantic search")
|
135 |
+
with st.spinner("Performing Similar/Contextual search"):
|
136 |
+
query = "Find {} related issues ?".format(queryList)
|
137 |
+
config = configparser.ConfigParser()
|
138 |
+
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
139 |
+
threshold = float(config.get('semantic_search','THRESHOLD'))
|
140 |
+
# st.write(query)
|
141 |
+
semantic_hits = semantic_search(query,paraList)
|
142 |
+
st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
|
143 |
+
|
144 |
+
for i,queryhit in enumerate(semantic_hits):
|
145 |
+
|
146 |
+
# st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
|
147 |
+
counter = 0
|
148 |
+
for hit in queryhit:
|
149 |
+
counter += 1
|
150 |
+
|
151 |
+
|
152 |
+
if hit['score'] > threshold:
|
153 |
+
# st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
154 |
+
st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
|
155 |
+
|
156 |
+
# document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
|
157 |
+
st.markdown("---")
|
158 |
+
# st.write(semantic_hits)
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
|
163 |
+
else:
|
164 |
+
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
165 |
+
logging.warning("Terminated as no keyword provided")
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
|