Spaces:

GEM
/

DatasetCardForm

Runtime error

Yacine Jernite

results

28dd726 about 3 years ago

4.21 kB

	import streamlit as st

	from .streamlit_utils import (
	make_multiselect,
	make_selectbox,
	make_text_area,
	make_text_input,
	make_radio,
	)

	N_FIELDS = 7


	def results_page():
	st.session_state.card_dict["results"] = st.session_state.card_dict.get(
	"results", {}
	)

	with st.expander("Previous Results", expanded=False):
	key_pref = ["results", "results"]
	st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
	"results"
	].get("results", {})
	make_text_area(
	label="What aspect of model ability can be measured with this dataset?",
	key_list=key_pref + ["model-abilities"],
	help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?.",
	)
	make_multiselect(
	label="What metrics are typically used for this task?",
	key_list=key_pref + ["metrics"],
	options=[
	"BERT-Score",
	"BLEU",
	"BLEURT",
	"ChrF",
	"Entailment",
	"FeQA",
	"METEOR", "MoverScore",
	"QAGS",
	"ROUGE",
	"WER",
	"Other: Other Metrics"
	],
	help="Select all metrics that are typically used when evaluating models for this task.",
	)
	if "Other: Other Metrics" in st.session_state.card_dict["results"]["results"].get("metrics", []):
	make_text_area(
	label="Definitions of other metrics",
	key_list=key_pref + ["other-metrics-definitions"],
	help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
	)
	else:
	st.session_state.card_dict["results"]["results"]["other-metrics-definitions"] = "N/A"
	make_text_area(
	label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.",
	key_list=key_pref + ["original-evaluation"],
	help="When the generation task was not evaluated when this dataset was introduced, write N/A.",
	)
	make_radio(
	label="Are previous results available?",
	options=["no", "yes"],
	key_list=key_pref + ["has-previous-results"],
	help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
	)
	if st.session_state.card_dict["results"]["results"]["has-previous-results"] == "yes":
	make_text_area(
	label="What evaluation approaches have others used?",
	key_list=key_pref + ["current-evaluation"],
	help="If the current evaluation strategy diverts from the original, describe how models are being evaluated.",
	)
	make_text_area(
	label="What are the most relevant previous results for this task/dataset",
	key_list=key_pref + ["previous-results"],
	help="List and describe the source and performance metrics for models on this dataset.",
	)
	else:
	st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
	st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"



	def results_summary():
	total_filled = sum(
	[len(dct) for dct in st.session_state.card_dict.get("results", {}).values()]
	)
	with st.expander(
	f"Previous Results Completion - {total_filled} of {N_FIELDS}", expanded=False
	):
	completion_markdown = ""
	completion_markdown += (
	f"- Overall completion:\n - {total_filled} of {N_FIELDS} fields\n"
	)
	completion_markdown += f"- Sub-section - Previous Results:\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
	st.markdown(completion_markdown)