import os
import pickle
import subprocess
import time
import urllib

import streamlit as st
from rank_bm25 import BM25Okapi, BM25Plus

from bm25Simple import BM25Simple

# Start-up diagnostics: print the module directory and the directory listings
# the app relies on.  The commands are constant strings, passed as a single
# string because that is the documented argument form when shell=True.
path = os.path.dirname(__file__)
print(path)
print(subprocess.run('ls -la', shell=True))
print()
print(subprocess.run('ls -la models/', shell=True))
print()
print(subprocess.run('ls -la content/', shell=True))

# Query pre-filled in the search box.
_DEFAULT_QUERY = (
    'How much do information retrieval and dissemination systems, as well as '
    'automated libraries, cost? Are they worth it to the researcher and to '
    'industry?'
)

# Sample queries listed in the "Query Examples" expander.
_QUERY_EXAMPLES = (
    'What systems incorporate multiprogramming or remote stations in '
    'information retrieval? What will be the extent of their use in the '
    'future?',
    'What problems and concerns are there in making up descriptive titles? '
    'What difficulties are involved in automatically retrieving articles '
    'from approximate titles?',
    'What is information science? Give definitions where possible.',
    'Some Considerations Relating to the Cost-Effectiveness of Online '
    'Services in Libraries',
    'A Fast Procedure for the Calculation of Similarity Coefficients in '
    'Automatic Classification',
)


def main():
    """Render the page: a search form and side-by-side BM25 results.

    Loads the corpus and the three ranking models, shows the query form and,
    once the form is submitted with a non-empty query, runs the query through
    each model and displays its search time and documents in its own column.
    An empty submitted query only shows a hint asking for a query.
    """
    st.set_page_config(
        # Can be "centered" or "wide". In the future also "dashboard", etc.
        layout="wide",
        # Can be "auto", "expanded", "collapsed"
        initial_sidebar_state="auto",
        # String or None. Strings get appended with "• Streamlit".
        page_title="BM25 based Information Retrieval System",
        # String, anything supported by st.image, or None.
        page_icon="🔎",
    )

    # LAYOUT
    # NOTE(review): the payloads of the unsafe_allow_html calls below (menu
    # style, radio layout, repository link) look like their markup was
    # stripped in this copy -- TODO confirm against the deployed version.
    hide_menu_style = """ """
    st.markdown(hide_menu_style, unsafe_allow_html=True)

    # horizontal radios
    st.write('', unsafe_allow_html=True)

    # load documents
    corpus = load_docs()

    # load models
    bm25_simple, bm25_okapi, bm25_plus = load_models()

    # UI
    st.header(':mag_right: BM25 based Information Retrieval System')
    st.markdown(''' git repository ''', unsafe_allow_html=True)
    st.markdown('---')

    with st.form("search_form"):
        query = st.text_input('Query', _DEFAULT_QUERY)
        st.caption('no text preprocessing')
        with st.expander("Query Examples"):
            # One "- item" per line so Markdown renders a bullet list.
            st.markdown(
                '\n'.join(f'- {example}' for example in _QUERY_EXAMPLES))
        submitted = st.form_submit_button('Search')

    # NOTE(review): results are rendered after the form block; the collapsed
    # source does not show whether they were nested inside it -- confirm.
    if not submitted:
        return
    if not query:
        st.text('add some query')
        return

    st.markdown('---')
    titled_models = (
        ('BM25 Simple', bm25_simple),
        ('BM25OKapi', bm25_okapi),
        ('BM25+', bm25_plus),
    )
    # One column per model, searched in the order listed above.
    for column, (title, model) in zip(st.columns(3), titled_models):
        with column:
            st.subheader(title)
            elapsed, most_relevant_documents = search_docs(
                model, query, corpus)
            st.caption(f'time: {elapsed}')
            print_docs(most_relevant_documents)


def search_docs(model, query, corpus, top_n=20):
    """Run ``query`` against ``corpus`` with ``model`` and time the search.

    Args:
        model: Object exposing ``get_top_n(tokens, corpus, n)``.
        query: Raw query string; it is only split on whitespace.
        corpus: Documents handed unchanged to ``model.get_top_n``.
        top_n: Maximum number of documents to return (default 20).

    Returns:
        A ``(elapsed, documents)`` tuple: the duration of the ``get_top_n``
        call in seconds and at most ``top_n`` documents from its result.
    """
    # split() without an argument collapses whitespace runs, so repeated,
    # leading or trailing spaces do not produce empty-string tokens.
    tokenized_query = query.split()
    # perf_counter() is the clock intended for measuring short intervals.
    start = time.perf_counter()
    most_relevant_documents = model.get_top_n(tokenized_query, corpus, top_n)
    elapsed = time.perf_counter() - start
    # Slice as a guard in case a model returns more than it was asked for.
    return elapsed, most_relevant_documents[:top_n]