import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

N_FIELDS = 7

def results_page():
    """Render the "Previous Results" section of the data card form."""
    st.session_state.card_dict["results"] = st.session_state.card_dict.get(
        "results", {}
    )
    with st.expander("Previous Results", expanded=False):
        key_pref = ["results", "results"]
        st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
            "results"
        ].get("results", {})
        make_text_area(
            label="What aspect of model ability can be measured with this dataset?",
            key_list=key_pref + ["model-abilities"],
            help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
        )
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=key_pref + ["metrics"],
            options=[
                "BERT-Score",
                "BLEU",
                "BLEURT",
                "ChrF",
                "Entailment",
                "FeQA",
                "METEOR",
                "MoverScore",
                "QAGS",
                "ROUGE",
                "WER",
                "Other: Other Metrics",
            ],
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
if "Other: Other Metrics" in st.session_state.card_dict["results"]["results"].get("metrics", []): | |
make_text_area( | |
label="Definitions of other metrics", | |
key_list=key_pref + ["other-metrics-definitions"], | |
help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.", | |
) | |
else: | |
st.session_state.card_dict["results"]["results"]["other-metrics-definitions"] = "N/A" | |
make_text_area( | |
label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.", | |
key_list=key_pref + ["original-evaluation"], | |
help="When the generation task was not evaluated when this dataset was introduced, write N/A.", | |
) | |
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=key_pref + ["has-previous-results"],
            help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
        )
        if st.session_state.card_dict["results"]["results"]["has-previous-results"] == "yes":
            make_text_area(
                label="What evaluation approaches have others used?",
                key_list=key_pref + ["current-evaluation"],
                help="If the current evaluation strategy diverges from the original, describe how models are being evaluated.",
            )
            make_text_area(
                label="What are the most relevant previous results for this task/dataset?",
                key_list=key_pref + ["previous-results"],
                help="List and describe the source and performance metrics for models on this dataset.",
            )
        else:
            st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
            st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"

def results_summary():
    """Render a completion counter for the "Previous Results" section."""
    total_filled = sum(
        [len(dct) for dct in st.session_state.card_dict.get("results", {}).values()]
    )
    with st.expander(
        f"Previous Results Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = ""
        completion_markdown += (
            f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
        )
        completion_markdown += f"- **Sub-section - Previous Results:**\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
        st.markdown(completion_markdown)
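

# Usage sketch (illustrative, not part of the original module): this file
# exposes `results_page()`, which renders the "Previous Results" questionnaire
# into `st.session_state.card_dict["results"]["results"]`, and
# `results_summary()`, which reports how many of the N_FIELDS fields are
# filled. A hypothetical host script could wire it up roughly as follows; the
# package path `datacards.results` and the host pre-seeding `card_dict` in
# session state are assumptions, not confirmed by this file.
#
#     import streamlit as st
#     from datacards.results import results_page, results_summary
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}  # shared store the form helpers write into
#
#     results_page()     # render the questionnaire for this section
#     results_summary()  # show the completion count for this section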