File size: 2,290 Bytes
7606e16
 
1ed024e
c5a2694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7606e16
1ed024e
7606e16
 
1ed024e
7606e16
 
 
 
 
 
 
 
 
15fac57
 
7606e16
 
 
15fac57
 
7606e16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import streamlit as st
import config
from pathlib import Path as P
import json


# Materialise the glob into a sorted list: Path.glob returns a one-shot
# generator that would be empty after its first iteration, and its yield order
# is not guaranteed, so sorting keeps the display order stable between runs.
nbow_results_path = sorted(P("assets").glob("nbow*"))

def display_metrics_dict(metrics, display_only_accuracy):
    """Render one metrics mapping in the Streamlit app.

    Writes a heading with the column list, a line with the model name, and
    then either the remaining metrics or just accuracy@10 as JSON.

    Args:
        metrics: Mapping that must contain ``"model_name"`` and ``"columns"``
            (a string that is split on ``"_"``). When
            ``display_only_accuracy`` is true it must also contain
            ``metrics["accuracy@k"]["10"]``. The mapping is not modified.
        display_only_accuracy: If true, show only accuracy@10; otherwise show
            every entry except ``"model_name"`` and ``"columns"``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Work on a shallow copy so the caller's dict keeps its "model_name" and
    # "columns" entries (popping from the argument made a second call fail).
    remaining = dict(metrics)
    model_name = remaining.pop("model_name")
    columns = remaining.pop("columns").split("_")

    st.markdown(f"### columns: {columns}")
    st.markdown(f"best model {model_name}")

    if display_only_accuracy:
        st.json({"accuracy@10": remaining["accuracy@k"]["10"]})
    else:
        st.json(remaining)

def display_metrics():
    """Render the test-metrics section of the app.

    Adds a sidebar checkbox (checked by default) that switches between showing
    only accuracy@10 and the full metrics, writes the section heading, then
    parses every path in the module-level ``nbow_results_path`` as JSON and
    hands the result to ``display_metrics_dict``.
    """
    display_only_accuracy = st.sidebar.checkbox("display only accuracy@10", value=True)
    st.markdown("## Test metrics for best validation modelon given columns")
    for p in nbow_results_path:
        # The context manager closes the handle as soon as the file is parsed
        # instead of leaving it open until garbage collection.
        with open(p, "r") as results_file:
            metrics = json.load(results_file)
        display_metrics_dict(metrics, display_only_accuracy)

# Script body: Streamlit renders these statements top to bottom on each run.

# Per-model test metrics section.
display_metrics()

# Result tables; the CSV locations come from the project's config module.
best_results_df = pd.read_csv(config.best_tasks_path)
worst_results_df = pd.read_csv(config.worst_tasks_path)

# Sidebar toggles for the two optional sections below.
show_worst_best_statistics = st.sidebar.checkbox(
    label="show worst/best statistics grouped by area"
)

show_area_aggregated_results = st.sidebar.checkbox(
    label="show results aggregated by area"
)
if show_worst_best_statistics:
    st.markdown(
        """
    ## Worst/best queries
    The following are top 10 worst/best queries per area by number of hits.
    There are at least 10 documents per query in the test set, so number of hits/10 is the accuracy.
    """
    )
    # Both selectboxes in this script use the label "sort by"; explicit keys
    # keep their widget identities distinct when both sections are shown.
    sort_key = st.selectbox(
        "sort by", list(best_results_df.columns), key="worst_best_sort_key"
    )
    st.markdown("## Queries with best results")
    st.table(best_results_df.sort_values(sort_key, ascending=False))
    st.markdown("## Queries with worst results")
    # NOTE(review): assumes worst_results_df has the same columns as
    # best_results_df, since sort_key is chosen from the latter — confirm.
    st.table(worst_results_df.sort_values(sort_key, ascending=False))

if show_area_aggregated_results:
    st.markdown("## Area aggregated results")
    # numeric_only=True: pandas >= 2.0 raises TypeError when asked to average
    # non-numeric columns, whereas older versions silently dropped them.
    best_results_agg = (
        best_results_df.groupby("area").mean(numeric_only=True).reset_index()
    )
    worst_results_agg = (
        worst_results_df.groupby("area").mean(numeric_only=True).reset_index()
    )
    sort_key = st.selectbox(
        "sort by", list(best_results_agg.columns), key="area_agg_sort_key"
    )
    st.markdown("Best results")
    st.table(best_results_agg.sort_values(sort_key, ascending=False))
    st.markdown("Worst results")
    st.table(worst_results_agg.sort_values(sort_key, ascending=False))