import os
import subprocess
import urllib
import pickle
import time
import streamlit as st
from rank_bm25 import BM25Okapi, BM25Plus
from bm25Simple import BM25Simple

def main():
    """Render the search page and show ranked results for each BM25 model.

    Loads the corpus and the three models, draws a query form, and, once the
    form is submitted with a non-empty query, fills three side-by-side
    columns (one per model) with the search time and the returned documents.
    """
    corpus = load_docs()
    bm25_simple, bm25_okapi, bm25_plus = load_models()

    with st.form("search_form"):
        query = st.text_input(
            'Query',
            'How much do information retrieval and dissemination systems, as well as automated libraries, cost? Are they worth it to the researcher and to industry?')
        st.caption('no text preprocessing')
        with st.expander("Query Examples"):
            st.markdown('''
                - What systems incorporate multiprogramming or remote stations in information retrieval? What will be the extent of their use in the future?
                - What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles?
                - What is information science? Give definitions where possible.
                - Some Considerations Relating to the Cost-Effectiveness of Online Services in Libraries
                - A Fast Procedure for the Calculation of Similarity Coefficients in Automatic Classification
            ''')
        submitted = st.form_submit_button('Search')

    # Results are only rendered after a submit that carries a non-empty query.
    if not (submitted and query):
        return

    # Column heading paired with the model that fills that column,
    # in left-to-right display order.
    rankers = (
        ('BM25 Simple', bm25_simple),
        ('BM25OKapi', bm25_okapi),
        ('BM25+', bm25_plus),
    )
    for column, (heading, ranker) in zip(st.columns(3), rankers):
        with column:
            st.subheader(heading)
            seconds, hits = search_docs(ranker, query, corpus)
            st.caption(f'time: {seconds}')
            print_docs(hits)

def search_docs(model, query, corpus, n=20):
    """Run *query* against *corpus* with *model* and time the lookup.

    Args:
        model: Object exposing ``get_top_n(tokenized_query, corpus, n)``.
        query: Raw query string. It is split on single spaces only; no
            other preprocessing is applied.
        corpus: Collection of documents handed through to ``get_top_n``.
        n: Maximum number of documents to return (defaults to 20).

    Returns:
        A ``(elapsed, documents)`` tuple, where ``elapsed`` is the time in
        seconds spent inside ``get_top_n`` and ``documents`` holds at most
        ``n`` results as returned by the model.
    """
    tokenized_query = query.split(" ")
    # perf_counter is a monotonic, high-resolution clock meant for timing
    # short intervals; time.time() follows the wall clock and can jump.
    start = time.perf_counter()
    most_relevant_documents = model.get_top_n(tokenized_query, corpus, n)
    elapsed = time.perf_counter() - start
    # Slice defensively in case the model hands back more than requested.
    return elapsed, most_relevant_documents[:n]

def print_docs(docs):
    for index, doc in enumerate(docs):
        st.markdown(f'''