Spaces:
Runtime error
Runtime error
import os | |
import subprocess | |
import urllib | |
import pickle | |
import time | |
import streamlit as st | |
from rank_bm25 import BM25Okapi, BM25Plus | |
from bm25Simple import BM25Simple | |
path = os.path.dirname(__file__) | |
print(path) | |
print(subprocess.run(['ls -la'], shell=True)) | |
print() | |
print(subprocess.run(['ls -la models/'], shell=True)) | |
print() | |
print(subprocess.run(['ls -la content/'], shell=True)) | |
# subprocess.run(['pip install --upgrade streamlit'], shell=True) | |
def main(): | |
st.set_page_config( | |
# Can be "centered" or "wide". In the future also "dashboard", etc. | |
layout="wide", | |
initial_sidebar_state="auto", # Can be "auto", "expanded", "collapsed" | |
# String or None. Strings get appended with "• Streamlit". | |
page_title="BM25 based Information Retrieval System", | |
page_icon="🔎", # String, anything supported by st.image, or None. | |
) | |
# LAYOUT | |
hide_menu_style = """ | |
<style> | |
#MainMenu {visibility: hidden; } | |
footer {visibility: hidden;} | |
</style> | |
""" | |
st.markdown(hide_menu_style, unsafe_allow_html=True) | |
# padding = 2 | |
# st.markdown(f""" <style> | |
# .reportview-container .main .block-container{{ | |
# padding-top: {padding}rem; | |
# padding-right: {padding}rem; | |
# padding-left: {padding}rem; | |
# padding-bottom: {padding}rem; | |
# }} </style> """, unsafe_allow_html=True) | |
# horizontal radios | |
st.write( | |
'<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True) | |
# load documents | |
corpus = load_docs() | |
# load models | |
bm25_simple, bm25_okapi, bm25_plus = load_models() | |
# UI | |
# st.header(f':mag_right: {algo}') | |
st.header(':mag_right: BM25 based Information Retrieval System') | |
st.markdown(''' | |
<a href="https://github.com/tcvieira/bm25-exercise-report" target="_blank" style="text-decoration: none;"> | |
<img src="https://cdn-icons-png.flaticon.com/512/25/25231.png" width="30" height="30" alt="github repository"></img> | |
</a>git repository | |
''', unsafe_allow_html=True) | |
st.markdown('---') | |
with st.form("search_form"): | |
query = st.text_input( | |
'Query', 'How much do information retrieval and dissemination systems, as well as automated libraries, cost? Are they worth it to the researcher and to industry?') | |
st.caption('no text preprocessing') | |
with st.expander("Query Examples"): | |
st.markdown(''' | |
- What systems incorporate multiprogramming or remote stations in information retrieval? What will be the extent of their use in the future? | |
- What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? | |
- What is information science? Give definitions where possible. | |
- Some Considerations Relating to the Cost-Effectiveness of Online Services in Libraries | |
- A Fast Procedure for the Calculation of Similarity Coefficients in Automatic Classification | |
''') | |
submitted = st.form_submit_button('Search') | |
if submitted: | |
if query: | |
st.markdown('---') | |
col1, col2, col3 = st.columns(3) | |
with col1: | |
st.subheader('BM25 Simple') | |
bm25_simple_time, most_relevant_documents = search_docs( | |
bm25_simple, query, corpus) | |
st.caption(f'time: {bm25_simple_time}') | |
print_docs(most_relevant_documents) | |
with col2: | |
st.subheader('BM25OKapi') | |
bm25_okapi_time, most_relevant_documents = search_docs( | |
bm25_okapi, query, corpus) | |
st.caption(f'time: {bm25_okapi_time}') | |
print_docs(most_relevant_documents) | |
with col3: | |
st.subheader('BM25+') | |
bm25_plus_time, most_relevant_documents = search_docs( | |
bm25_plus, query, corpus) | |
st.caption(f'time: {bm25_plus_time}') | |
print_docs(most_relevant_documents) | |
else: | |
st.text('add some query') | |
def search_docs(model, query, corpus): | |
tokenized_query = query.split(" ") | |
start = time.time() | |
most_relevant_documents = model.get_top_n( | |
tokenized_query, corpus, 20) | |
elapsed = (time.time() - start) | |
return elapsed, most_relevant_documents[:20] | |
def print_docs(docs): | |
for index, doc in enumerate(docs): | |
st.markdown(f''' | |
<div style="text-align: justify"> | |
<strong>{index+1}</strong>: {doc} | |
</div> | |
</br> | |
''', unsafe_allow_html=True) | |
def load_docs(): | |
# Processing DOCUMENTS | |
doc_set = {} | |
doc_id = "" | |
doc_text = "" | |
documents_file, _ = urllib.request.urlretrieve( | |
'https://raw.githubusercontent.com/tcvieira/bm25-exercise-report/main/content/CISI.ALL', 'CISI.ALL.downloaded') | |
with open(documents_file) as f: | |
lines = "" | |
for l in f.readlines(): | |
lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip() | |
lines = lines.lstrip("\n").split("\n") | |
for l in lines: | |
if l.startswith(".I"): | |
doc_id = int(l.split(" ")[1].strip())-1 | |
elif l.startswith(".X"): | |
doc_set[doc_id] = doc_text.lstrip(" ") | |
doc_id = "" | |
doc_text = "" | |
else: | |
# The first 3 characters of a line can be ignored. | |
doc_text += l.strip()[3:] + " " | |
return list(doc_set.values()) | |
def load_models(): | |
bm25_simple_file, _ = urllib.request.urlretrieve( | |
'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25_simple.pkl?raw=true', 'bm25_simple_file.downloaded') | |
with open(bm25_simple_file, 'rb') as file: | |
bm25_simple: BM25Simple = pickle.load(file) | |
print(bm25_simple.corpus_size) | |
bm25_okapi_file, _ = urllib.request.urlretrieve( | |
'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25Okapi.pkl?raw=true', 'bm25_okapi_file.downloaded') | |
with open(bm25_okapi_file, 'rb') as file: | |
bm25_okapi: BM25Okapi = pickle.load(file) | |
print(bm25_okapi.corpus_size) | |
bm25_plus_file, _ = urllib.request.urlretrieve( | |
'https://github.com/tcvieira/bm25-exercise-report/blob/main/models/BM25Plus.pkl?raw=true', 'bm25_plus_file.downloaded') | |
with open(bm25_plus_file, 'rb') as file: | |
bm25_plus: BM25Plus = pickle.load(file) | |
print(bm25_plus.corpus_size) | |
print(subprocess.run(['ls -la'], shell=True)) | |
# st.success("BM25 models loaded!", icon='✅') | |
return bm25_simple, bm25_okapi, bm25_plus | |
if __name__ == "__main__": | |
main() | |