paperswithcode_nbow / pages /2_Statistics.py
lambdaofgod's picture
assets
15fac57
raw
history blame
1.49 kB
import pandas as pd
import streamlit as st
# Statistics page: renders optional tables of best/worst retrieval results.
#
# Expects two pre-computed CSVs with per-query hit counts, each containing
# an "area" column plus numeric hit/score columns (schema assumed from the
# asset names — TODO confirm against the asset-generation script).
best_results_df = pd.read_csv("assets/best_tasks_with_hits.csv")
worst_results_df = pd.read_csv("assets/worst_tasks_with_hits.csv")

# Sidebar toggles controlling which sections of the page are shown.
show_worst_best_statistics = st.sidebar.checkbox(
    label="show worst/best statistics grouped by area"
)
show_area_aggregated_results = st.sidebar.checkbox(
    label="show results aggregated by area"
)

if show_worst_best_statistics:
    st.markdown(
        """
## Worst/best queries
The following are top 10 worst/best queries per area by number of hits.
There are at least 10 documents per query in the test set, so number of hits/10 is the accuracy.
"""
    )
    # Explicit key: the aggregated section below also has a "sort by"
    # selectbox; without distinct keys Streamlit raises DuplicateWidgetID
    # when both sections are enabled at once.
    sort_key = st.selectbox(
        "sort by", list(best_results_df.columns), key="sort_by_query"
    )
    st.markdown("## Queries with best results")
    st.table(best_results_df.sort_values(sort_key, ascending=False))
    st.markdown("## Queries with worst results")
    st.table(worst_results_df.sort_values(sort_key, ascending=False))

if show_area_aggregated_results:
    st.markdown("## Area aggregated results")
    # numeric_only=True: the frames contain string columns (query text),
    # and mean() over them raises TypeError on pandas >= 2.0.
    best_results_agg = (
        best_results_df.groupby("area").mean(numeric_only=True).reset_index()
    )
    worst_results_agg = (
        worst_results_df.groupby("area").mean(numeric_only=True).reset_index()
    )
    # Distinct key (see note above) to avoid a widget-ID collision with
    # the per-query "sort by" selectbox.
    sort_key = st.selectbox(
        "sort by", list(best_results_agg.columns), key="sort_by_area"
    )
    st.markdown("Best results")
    st.table(best_results_agg.sort_values(sort_key, ascending=False))
    st.markdown("Worst results")
    st.table(worst_results_agg.sort_values(sort_key, ascending=False))