Spaces:
Runtime error
Runtime error
File size: 6,149 Bytes
ea72d75 74ce942 ea72d75 74ce942 d5f15cb b64c266 74ce942 ea72d75 74ce942 ea72d75 74ce942 ea72d75 74ce942 ea72d75 74ce942 ea72d75 74ce942 ea72d75 74ce942 356174d 176bc83 74ce942 ea72d75 74ce942 176bc83 ea72d75 74ce942 ea72d75 74ce942 d5f15cb 74ce942 d5f15cb ea72d75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
from logging import getLogger
from pathlib import Path
import pandas as pd
import plotly.express as px
import streamlit as st
from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
from utilities import initialization
# Called at module top level, i.e. before main() draws anything.
# NOTE(review): presumably this populates st.session_state -- main() reads
# `model`, `umap_model`, `data`, `topics` and `topic_str_to_word` from it.
# Confirm against utilities.initialization.
initialization()
# @st.cache(show_spinner=False)
# def initialize_state():
# with st.spinner("Loading app..."):
# if 'model' not in st.session_state:
# model = Top2Vec.load('models/model.pkl')
# model._check_model_status()
# model.hierarchical_topic_reduction(num_topics=20)
#
# st.session_state.model = model
# st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
# logger.info("loading data...")
#
# if 'data' not in st.session_state:
# logger.info("loading data...")
# data = pd.read_csv(proj_dir / 'data' / 'data.csv')
# data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
# st.session_state.data = data
# st.session_state.selected_data = data
# st.session_state.all_topics = list(data.topic_id.unique())
#
# if 'topics' not in st.session_state:
# logger.info("loading topics...")
# topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
# topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
# st.session_state.topics = topics
#
# st.session_state.selected_points = []
def main():
    """Render the semantic-search page: query box, 2-D scatter plot, result tables.

    Reads ``model``, ``umap_model``, ``data``, ``topics`` and
    ``topic_str_to_word`` from ``st.session_state`` and stores the search
    results back into it as ``search_raw_df``, ``data_to_model``,
    ``data_to_model_with_point`` and ``data_to_model_without_point``.
    """
    state = st.session_state

    max_docs = st.sidebar.slider("# docs", 10, 100, value=50)
    to_search = st.text_input("Write your query here", "") or ""

    with st.spinner('Embedding Query...'):
        vector = state.model.embed([to_search])

    with st.spinner('Dimension Reduction...'):
        point = state.umap_model.transform(vector.reshape(1, -1))

    # Only the scores and ids are used; the first returned element is ignored.
    _, document_scores, document_ids = state.model.search_documents_by_vector(
        vector.flatten(), num_docs=max_docs)

    state.search_raw_df = pd.DataFrame(
        {'document_ids': document_ids, 'document_scores': document_scores})
    state.data_to_model = state.data.merge(
        state.search_raw_df, left_on='id', right_on='document_ids',
    ).drop(['document_ids'], axis=1)
    # to make legend sorted https://bioinformatics.stackexchange.com/a/18847
    state.data_to_model = state.data_to_model.sort_values(
        by='document_scores', ascending=False)

    # Append the query itself as one extra, last row so it can be plotted
    # next to the documents.
    # NOTE(review): the positional list assumes the merged frame's columns are
    # (id, <projection coords>, documents, topic_id, document_scores) -- confirm.
    state.data_to_model.loc[len(state.data_to_model.index)] = [
        'Point', *point[0].tolist(), to_search, 'Query', 0]
    state.data_to_model_with_point = state.data_to_model
    state.data_to_model_without_point = state.data_to_model.iloc[:-1]

    def get_topics_counts() -> pd.DataFrame:
        """Count retrieved documents per topic and attach the topic metadata.

        Returns a frame whose first two columns are ``topic_id`` and
        ``topic_count``, followed by the remaining columns of
        ``st.session_state.topics``.
        """
        # Name both columns explicitly instead of relying on what
        # ``value_counts().to_frame()`` calls them: that naming changed in
        # pandas 2.0, which broke the former ``topic_id_x``/``topic_id_y``
        # drop-and-rename.
        counts = (
            state.data_to_model_without_point["topic_id"]
            .value_counts()
            .rename_axis('topic_id')
            .reset_index(name='topic_count')
        )
        return counts.merge(state.topics, on='topic_id')

    st.write("""
# Semantic Search
This shows a 2d representation of documents embeded in a semantic space. Each dot is a document
and the dots close represent documents that are close in meaning.
Note that the distance metrics were computed at a higher dimension so take the representation with
a grain of salt.
The Query is shown with the documents in yellow.
"""
    )

    # Split documents and query marker positionally (the marker is the row
    # appended last above) rather than sorting everything by topic_id and
    # hoping the 'Query' label sorts after every real topic id.
    docs_sorted = state.data_to_model_without_point.sort_values(by='topic_id', ascending=True)
    query_row = state.data_to_model_with_point.tail(1)

    fig = px.scatter(docs_sorted, x='x', y='y', color='topic_id', template='plotly_dark',
                     hover_data=['id', 'topic_id', 'x', 'y'])
    fig.add_traces(
        px.scatter(query_row, x="x", y="y")
        .update_traces(marker_size=10, marker_color="yellow")
        .data
    )
    st.plotly_chart(fig, use_container_width=True)

    tab1, tab2 = st.tabs(["Docs", "Topics"])

    with tab1:
        cols = ['id', 'document_scores', 'topic_id', 'documents']
        # Explicit copy: the new column must not be written onto a slice of
        # the frame that lives in session state.
        data = state.data_to_model_without_point.loc[:, cols].copy()
        data['topic_word'] = data.topic_id.replace(state.topic_str_to_word)
        ordered_cols = ['id', 'document_scores', 'topic_id', 'topic_word', 'documents']

        builder = GridOptionsBuilder.from_dataframe(data[ordered_cols])
        builder.configure_pagination()
        builder.configure_column(
            'document_scores',
            type=["numericColumn", "numberColumnFilter", "customNumericFormat"],
            precision=2)
        grid_options = builder.build()
        AgGrid(data[ordered_cols], theme='streamlit', gridOptions=grid_options,
               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)

    with tab2:
        cols = ['topic_id', 'topic_count', 'topic_0']
        topic_counts = get_topics_counts()

        builder = GridOptionsBuilder.from_dataframe(topic_counts[cols])
        builder.configure_pagination()
        builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
        grid_options = builder.build()
        AgGrid(topic_counts.loc[:, cols], theme='streamlit', gridOptions=grid_options,
               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)
if __name__ == "__main__":
    # Script entry point: set up module globals, tweak pandas display, draw
    # the page title in both the main area and the sidebar, then run the page.

    # Project root is taken as the third ancestor of this file
    # (parents[0] is the containing folder).
    proj_dir = Path(__file__).parents[2]
    logger = getLogger(__name__)

    # Same option as pd.set_option('display.max_colwidth', 0), via attribute access.
    pd.options.display.max_colwidth = 0

    # Wide layout is currently disabled:
    # st.set_page_config(layout="wide")

    md_title = "# Semantic Search π"
    for container in (st, st.sidebar):
        container.markdown(md_title)

    # State loading happens via utilities.initialization() at import time,
    # so the legacy initialize_state() call stays disabled.
    main()
|