"""Streamlit page that runs one query against three pickled BM25 models.

The document file (CISI.ALL) and the three model pickles are downloaded from
GitHub and the loaders are wrapped in ``st.cache`` so the downloads are not
repeated on every script run.
"""
import os
import pickle
import subprocess
import time
import urllib
# `import urllib` alone does not load the `request` submodule; import it
# explicitly instead of relying on another package having done so.
import urllib.request

import streamlit as st
from rank_bm25 import BM25Okapi, BM25Plus

from bm25Simple import BM25Simple

# Debug output: print the script directory and list the working directory,
# models/ and content/ each time the script is executed.
path = os.path.dirname(__file__)
print(path)
print(subprocess.run(['ls -la'], shell=True))
print()
print(subprocess.run(['ls -la models/'], shell=True))
print()
print(subprocess.run(['ls -la content/'], shell=True))


def main():
    """Render the search page and show each model's results in its own column."""
    st.set_page_config(
        # Can be "centered" or "wide". In the future also "dashboard", etc.
        layout="wide",
        initial_sidebar_state="auto",  # Can be "auto", "expanded", "collapsed"
        # String or None. Strings get appended with "• Streamlit".
        page_title="BM25 based Information Retrieval System",
        page_icon="🔎",  # String, anything supported by st.image, or None.
    )

    # LAYOUT
    # NOTE(review): the style payload is a single space here; it looks like
    # markup was lost before this file reached review - confirm.
    hide_menu_style = """ """
    st.markdown(hide_menu_style, unsafe_allow_html=True)

    # horizontal radios
    st.write('', unsafe_allow_html=True)

    # load documents
    corpus = load_docs()

    # load models
    bm25_simple, bm25_okapi, bm25_plus = load_models()

    # UI
    st.header(':mag_right: BM25 based Information Retrieval System')
    st.markdown('''

git repository ''', unsafe_allow_html=True)

    st.markdown('---')

    with st.form("search_form"):
        query = st.text_input(
            'Query', 'How much do information retrieval and dissemination systems, as well as automated libraries, cost? Are they worth it to the researcher and to industry?')
        st.caption('no text preprocessing')

        with st.expander("Query Examples"):
            # NOTE(review): this reads like a bullet list that lost its line
            # breaks; the text is kept exactly as found - confirm.
            st.markdown(''' - What systems incorporate multiprogramming or remote stations in information retrieval? What will be the extent of their use in the future? - What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? - What is information science? Give definitions where possible. - Some Considerations Relating to the Cost-Effectiveness of Online Services in Libraries - A Fast Procedure for the Calculation of Similarity Coefficients in Automatic Classification ''')

        submitted = st.form_submit_button('Search')

    # NOTE(review): nesting below was reconstructed from a whitespace-collapsed
    # source; results are assumed to render after (not inside) the form.
    if not submitted:
        return

    if not query:
        st.text('add some query')
        return

    st.markdown('---')

    # One column per model, in the same left-to-right order as before.
    rankers = (
        ('BM25 Simple', bm25_simple),
        ('BM25OKapi', bm25_okapi),
        ('BM25+', bm25_plus),
    )
    for column, (title, model) in zip(st.columns(3), rankers):
        with column:
            st.subheader(title)
            elapsed, most_relevant_documents = search_docs(
                model, query, corpus)
            st.caption(f'time: {elapsed}')
            print_docs(most_relevant_documents)


def search_docs(model, query, corpus, top_n=20):
    """Rank *corpus* against *query* with *model* and time the call.

    Args:
        model: Object exposing ``get_top_n(tokens, corpus, n)``.
        query: Raw query string; it is split on single spaces only.
        corpus: Documents handed to ``model.get_top_n``.
        top_n: Maximum number of documents to return (default 20).

    Returns:
        ``(elapsed_seconds, documents)`` where ``documents`` holds at most
        ``top_n`` entries.
    """
    tokenized_query = query.split(" ")

    # perf_counter is the clock intended for measuring short durations;
    # time.time() follows the wall clock and can jump.
    start = time.perf_counter()
    most_relevant_documents = model.get_top_n(tokenized_query, corpus, top_n)
    elapsed = time.perf_counter() - start

    return elapsed, most_relevant_documents[:top_n]


def print_docs(docs):
    """Write each document to the page, prefixed with its 1-based position."""
    for index, doc in enumerate(docs):
        st.markdown(f'''

{index+1}: {doc}

''', unsafe_allow_html=True)


# NOTE(review): st.cache is deprecated in recent Streamlit releases in favour
# of st.cache_data / st.cache_resource - migrate once the target version is
# confirmed.
@st.cache(ttl=3600, allow_output_mutation=True, show_spinner=True, max_entries=2)
def load_docs():
    """Download CISI.ALL and return the document texts as a list of strings.

    Lines that start with "." open a new record; every other line is folded
    onto the record before it. A ".I" record sets the current document id,
    a ".X" record stores the text collected so far under that id, and any
    other record contributes its text minus the first three characters.
    """
    documents_file, _ = urllib.request.urlretrieve(
        'https://raw.githubusercontent.com/tcvieira/bm25-exercise-report/main/content/CISI.ALL',
        'CISI.ALL.downloaded')

    # Collect the pieces in a list and join once instead of growing one big
    # string with += for every line of the file.
    pieces = []
    with open(documents_file) as f:
        for raw_line in f:
            text = raw_line.strip()
            pieces.append("\n" + text if raw_line.startswith(".") else " " + text)
    records = "".join(pieces).lstrip("\n").split("\n")

    doc_set = {}
    doc_id = ""
    doc_text = ""
    for record in records:
        if record.startswith(".I"):
            doc_id = int(record.split(" ")[1].strip()) - 1
        elif record.startswith(".X"):
            doc_set[doc_id] = doc_text.lstrip(" ")
            doc_id = ""
            doc_text = ""
        else:
            # The first 3 characters of a line can be ignored.
            doc_text += record.strip()[3:] + " "

    return list(doc_set.values())


def _download_and_unpickle(url, local_name):
    """Download *url* to *local_name* and return the unpickled object."""
    downloaded_file, _ = urllib.request.urlretrieve(url, local_name)
    with open(downloaded_file, 'rb') as file:
        # NOTE: unpickling can execute arbitrary code, and this data comes
        # from a remote URL - the source repository is fully trusted here.
        return pickle.load(file)


@st.cache(ttl=3600, allow_output_mutation=True, show_spinner=True, max_entries=2)
def load_models():
    """Download and unpickle the three BM25 models.

    Returns:
        ``(bm25_simple, bm25_okapi, bm25_plus)``. Each model's
        ``corpus_size`` is printed as it is loaded.
    """
    bm25_simple: BM25Simple = _download_and_unpickle(
        'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25_simple.pkl?raw=true',
        'bm25_simple_file.downloaded')
    print(bm25_simple.corpus_size)

    bm25_okapi: BM25Okapi = _download_and_unpickle(
        'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25Okapi.pkl?raw=true',
        'bm25_okapi_file.downloaded')
    print(bm25_okapi.corpus_size)

    bm25_plus: BM25Plus = _download_and_unpickle(
        'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25Plus.pkl?raw=true',
        'bm25_plus_file.downloaded')
    print(bm25_plus.corpus_size)

    # Debug output: show the downloaded files in the working directory.
    print(subprocess.run(['ls -la'], shell=True))
    # st.success("BM25 models loaded!", icon='✅')
    return bm25_simple, bm25_okapi, bm25_plus


if __name__ == "__main__":
    main()