Spaces:
Sleeping
Sleeping
Delete appStore/keyword_search.py
Browse files- appStore/keyword_search.py +0 -176
appStore/keyword_search.py
DELETED
@@ -1,176 +0,0 @@
|
|
1 |
-
# set path
import glob, os, sys;
sys.path.append('../utils')

import streamlit as st
import json
import logging
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect

# Declare all the necessary variables
# All search parameters are read once at import time from paramconfig.cfg and
# shared as module-level constants by app() below.
config = getconfig('paramconfig.cfg')

# --- semantic search: paragraph-splitting parameters ---
split_by = config.get('semantic_search','SPLIT_BY')
split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
# Config stores booleans as "0"/"1"; bool(int(...)) converts them correctly
# (plain bool("0") would be True).
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                    'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search','REMOVE_PUNC')))

# --- semantic search: retriever / reader model parameters ---
embedding_model = config.get('semantic_search','RETRIEVER')
embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search','EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search','MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search','READER')
# NOTE(review): this reads RETRIEVER_TOP_K, not READER_TOP_K — looks like a
# copy-paste slip, so reader_top_k always equals retriever_top_k. Confirm
# whether paramconfig.cfg defines a READER_TOP_K key before changing.
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
top_k_per_candidate = int(config.get('semantic_search','READER_TOP_K_PER_CANDIDATE'))

# --- lexical (TF-IDF / exact-match) search parameters ---
lexical_split_by= config.get('lexical_search','SPLIT_BY')
lexical_split_length=int(config.get('lexical_search','SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search','REMOVE_PUNC')))
lexical_top_k=int(config.get('lexical_search','TOP_K'))
|
35 |
-
|
36 |
-
def app():
    """Render the keyword/semantic Search page of the Streamlit app.

    Layout (render order matters in Streamlit):
      1. Page title.
      2. An "About this app" expander with usage notes and runtime metrics.
      3. A sidebar offering pre-defined keyword categories loaded from
         docStore/sample/keywordexample.json.
      4. A query box plus a "Show only Exact Matches" checkbox; on button
         press, runs either the lexical (TF-IDF) or the semantic pipeline
         against the document previously stored in st.session_state.

    Reads module-level config constants (split_*, *_top_k, embedding_*, ...)
    and st.session_state['filepath'] / ['filename'] set by the upload page.
    Returns nothing; all output is rendered through Streamlit.
    """

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *Search* app is an interface \
            for doing contextual and keyword searches in \
            policy documents. \
            """)
        st.write("")
        # NOTE: trailing backslashes inside these triple-quoted strings join
        # lines so the UI shows one flowing paragraph.
        st.write(""" The application allows its user to perform a search\
        based on two options: a lexical search([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf))\
        and semantic search. [bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)\
        The lexical search only \
        displays paragraphs in the document with exact matching results, \
        the semantic search shows paragraphs with meaningful connections \
        (e.g., synonyms) based on the search context. Both \
        methods employ a probabilistic retrieval framework in its identification\
        of relevant paragraphs. By default the search is performed using \
        'Semantic Search', and to find 'Exact/Lexical Matches' please tick the \
        checkbox provided which will by-pass semantic search. Furthermore,\
        the application allows the user to search for pre-defined keywords \
        from different thematic buckets present in sidebar.""")
        st.write("")
        st.write(""" The Exact Matches gives back top {} findings, and Semantic
        search provides with top {} answers.""".format(lexical_top_k, retriever_top_k))
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1,col2,col3= st.columns([2,4,4])
        with col1:
            st.caption("OCR File processing")
            st.write("50 sec")

        with col2:
            st.caption("Lexical Search on 200 paragraphs(~ 35 pages)")
            st.write("15 sec")

        with col3:
            st.caption("Semantic search on 200 paragraphs(~ 35 pages)")
            st.write("120 sec(including embedding creation)")

    with st.sidebar:
        # Pre-defined keyword sets, grouped by thematic category.
        with open('docStore/sample/keywordexample.json','r') as json_file:
            keywordexample = json.load(json_file)

        st.caption("Select Keyword Category")
        # checkbox_without_preselect returns the chosen category name, or a
        # falsy value when nothing is ticked.
        genre = checkbox_without_preselect(list(keywordexample.keys()))
        if genre:
            keywordList = keywordexample[genre]
        else:
            keywordList = None

        st.markdown("---")

    with st.container():
        type_hinting = "Please enter here your question and we \
                        will look for an answer in the document\
                        OR enter the keyword you are looking \
                        for and we will look for similar\
                        context in the document.\
                        You can also explore predefined sets of keywords from sidebar. "
        # Pre-fill the query box with the selected keyword set, if any.
        if keywordList is not None:
            queryList = st.text_input(type_hinting,
                                      value = "{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you dont have any, \
                        please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                # 'filepath' is set by the upload page; without it there is
                # no document to search.
                if 'filepath' in st.session_state:

                    if searchtype:
                        # Exact-match path: TF-IDF lexical search.
                        all_documents = runLexicalPreprocessingPipeline(
                            file_name=st.session_state['filename'],
                            file_path=st.session_state['filepath'],
                            split_by=lexical_split_by,
                            split_length=lexical_split_length,
                            split_overlap=lexical_split_overlap,
                            remove_punc=lexical_remove_punc)
                        logging.info("performing lexical search")
                        with st.spinner("Performing Exact matching search \
                                        (Lexical search) for you"):
                            lexical_search(query=queryList,
                                           documents = all_documents['documents'],
                                           top_k = lexical_top_k )
                    else:
                        # Default path: semantic (bi-encoder) search.
                        all_documents = runSemanticPreprocessingPipeline(
                            file_path= st.session_state['filepath'],
                            file_name = st.session_state['filename'],
                            split_by=split_by,
                            split_length= split_length,
                            split_overlap=split_overlap,
                            remove_punc= remove_punc,
                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                        # Warn the user about runtime on large documents
                        # (embeddings are created on the fly).
                        if len(all_documents['documents']) > 100:
                            warning_msg = ": This might take sometime, please sit back and relax."
                        else:
                            warning_msg = ""

                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                            semantic_keywordsearch(query = queryList,
                                    documents = all_documents['documents'],
                                    embedding_model=embedding_model,
                                    embedding_layer=embedding_layer,
                                    embedding_model_format=embedding_model_format,
                                    reader_model=reader_model,reader_top_k=reader_top_k,
                                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
                                    max_seq_len=max_seq_len,
                                    top_k_per_candidate = top_k_per_candidate)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|