Spaces:

GEM
/

DatasetCardForm

Runtime error

DatasetCardForm / datacards /overview.py

Sebastian Gehrmann

Merge branch 'main' of https://huggingface.co/spaces/GEM/DatasetCardForm

f922ab7 almost 3 years ago

12.8 kB

	import json
	import streamlit as st

	from os.path import join as pjoin

	from .streamlit_utils import (
	make_multiselect,
	make_selectbox,
	make_text_area,
	make_text_input,
	make_radio,
	)

	# Expected number of entries in each sub-section of the overview form.
	# overview_summary() shows these as the "of N fields" denominators.
	# NOTE(review): the "Languages and Intended Use" form references 10 distinct
	# keys (including license-other and task-other) while the constant below is 8;
	# confirm which count is intended.
	N_FIELDS_WHERE = 9
	N_FIELDS_LANGUAGES = 8
	N_FIELDS_CREDIT = 5
	N_FIELDS_STRUCTURE = 7

	# Denominator for the overall completion figure.
	N_FIELDS = sum(
	    (N_FIELDS_WHERE, N_FIELDS_LANGUAGES, N_FIELDS_CREDIT, N_FIELDS_STRUCTURE)
	)


	def _load_json_resource(filename):
	    """Parse a UTF-8 encoded JSON file from the ``resources`` directory.

	    The path is relative to the current working directory. The file is opened
	    in a ``with`` block so the handle is closed as soon as parsing finishes,
	    instead of being left for the garbage collector.
	    """
	    with open(pjoin("resources", filename), encoding="utf-8") as handle:
	        return json.load(handle)


	# Entries under "subtags" in bcp47.json whose "type" is "language"; their
	# "description" lists feed the language multiselect in overview_page().
	languages_bcp47 = [
	    subtag
	    for subtag in _load_json_resource("bcp47.json")["subtags"]
	    if subtag["type"] == "language"
	]

	# Parsed contents of licenses.json; used as the options of the license selectbox.
	license_list = _load_json_resource("licenses.json")


	def _ensure_overview_section(name):
	    """Create ``card_dict["overview"][name]`` if missing and return its key prefix."""
	    st.session_state.card_dict["overview"].setdefault(name, {})
	    return ["overview", name]


	def _render_what_section():
	    """Render the "What is this dataset?" expander (shown expanded)."""
	    with st.expander("What is this dataset?", expanded=True):
	        key_pref = _ensure_overview_section("what")
	        make_text_area(
	            label="Provide a summary of this dataset in 3-4 sentences.",
	            key_list=key_pref + ["dataset"],
	            help="[free text]",
	        )


	def _render_where_section():
	    """Render the expander with links, citation, leaderboard and contact fields."""
	    with st.expander("Where to find the data and its documentation", expanded=False):
	        key_pref = _ensure_overview_section("where")
	        make_text_input(
	            label="What is the webpage for the dataset (if it exists)?",
	            key_list=key_pref + ["website"],
	            help="[URL]",
	        )
	        make_text_input(
	            label="What is the link to where the original dataset is hosted?",
	            key_list=key_pref + ["data-url"],
	            help="[URL]",
	        )
	        make_text_input(
	            label="What is the link to the paper describing the dataset (open access preferred)?",
	            key_list=key_pref + ["paper-url"],
	            help="[URL]",
	        )
	        make_text_area(
	            label="Provide the BibTex-formatted reference for the dataset. Please use the correct published version (ACL anthology, etc.) instead of google scholar created Bibtex.",
	            key_list=key_pref + ["paper-bibtext"],
	            help="[free text]",
	        )
	        make_radio(
	            label="Does the dataset have an active leaderboard?",
	            options=["no", "yes"],
	            key_list=key_pref + ["has-leaderboard"],
	            help="If no, enter N/A for the following two fields",
	        )
	        # Looked up after the radio call because the answer is read back from
	        # card_dict (presumably make_radio stores it there; verify in
	        # streamlit_utils). `.get` mirrors the license/task checks, so a
	        # missing key falls into the "N/A" branch instead of raising KeyError.
	        where = st.session_state.card_dict["overview"]["where"]
	        if where.get("has-leaderboard") == "yes":
	            make_text_input(
	                label="Provide a link to the leaderboard.",
	                key_list=key_pref + ["leaderboard-url"],
	                help="[URL] or N/A",
	            )
	            make_text_area(
	                label="Briefly describe how the leaderboard evaluates models.",
	                key_list=key_pref + ["leaderboard-description"],
	                help="[free text; a paragraph] or N/A",
	            )
	        else:
	            # No leaderboard: record explicit placeholders for both follow-ups.
	            where["leaderboard-url"] = "N/A"
	            where["leaderboard-description"] = "N/A"
	        make_text_input(
	            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
	            key_list=key_pref + ["contact-name"],
	            help="[free text]",
	        )
	        make_text_input(
	            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
	            key_list=key_pref + ["contact-email"],
	            help="[free text]",
	        )


	def _render_languages_section():
	    """Render the expander covering languages, intended use, license and task."""
	    with st.expander("Languages and Intended Use", expanded=False):
	        key_pref = _ensure_overview_section("languages")
	        make_radio(
	            label="Is the dataset multilingual?",
	            options=["no", "yes"],
	            key_list=key_pref + ["is-multilingual"],
	            help="More than one language present in all of the text fields",
	        )
	        make_multiselect(
	            label="What languages/dialects are covered in the dataset?",
	            key_list=key_pref + ["language-names"],
	            options=[", ".join(x["description"]) for x in languages_bcp47],
	            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
	        )
	        make_text_area(
	            label="What dialects are covered? Are there multiple dialects per language?",
	            key_list=key_pref + ["language-dialects"],
	            help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
	        )
	        make_text_area(
	            label="Whose language is in the dataset?",
	            key_list=key_pref + ["language-speakers"],
	            help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
	        )
	        make_text_area(
	            label="What is the intended use of the dataset?",
	            key_list=key_pref + ["intended-use"],
	            help="[free text, paragraphs] - Describe how the dataset creators describe its purpose and intended use.",
	        )
	        make_selectbox(
	            label="What is the license of the dataset?",
	            key_list=key_pref + ["license"],
	            options=license_list,
	            help="select `other` if missing from list, `unkown` if not provided.",
	        )
	        # Ask for a free-text license only when the stored choice contains
	        # "other"; otherwise store a placeholder.
	        languages = st.session_state.card_dict["overview"]["languages"]
	        if "other" in languages.get("license", []):
	            make_text_input(
	                label="What is the 'other' license of the dataset?",
	                key_list=key_pref + ["license-other"],
	                help="[free text]",
	            )
	        else:
	            languages["license-other"] = "N/A"

	        make_selectbox(
	            label="What primary task does the dataset support?",
	            key_list=key_pref + ["task"],
	            options=[
	                "",  # default needs to be invalid value to make sure people actually fill in
	                "Content Transfer",
	                "Data-to-Text",
	                "Dialog Response Generation",
	                "Paraphrasing",
	                "Question Generation",
	                "Reasoning",
	                "Simplification",
	                "Style Transfer",
	                "Summarization",
	                "Text-to-Slide",
	                "Other",
	            ],
	            help="Select `other` if the task is not included in the list.",
	        )
	        # Same pattern as the license: free text for "Other", placeholder otherwise.
	        languages = st.session_state.card_dict["overview"]["languages"]
	        if "Other" in languages.get("task", []):
	            make_text_input(
	                label="What is the primary task?",
	                key_list=key_pref + ["task-other"],
	                help="[free text]",
	            )
	        else:
	            languages["task-other"] = "N/A"

	        make_text_area(
	            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
	            key_list=key_pref + ["communicative"],
	            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
	        )


	def _render_credit_section():
	    """Render the expander asking who curated, created, funded and added the dataset."""
	    with st.expander("Credit", expanded=False):
	        key_pref = _ensure_overview_section("credit")
	        make_multiselect(
	            label="In what kind of organization did the dataset curation happen?",
	            options=["industry", "academic", "independent", "other"],
	            key_list=key_pref + ["organization-type"],
	        )
	        make_text_input(
	            label="Name the organization(s).",
	            key_list=key_pref + ["organization-names"],
	            help="comma-separated",
	        )
	        make_text_input(
	            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
	            key_list=key_pref + ["creators"],
	            help="name (affiliation); comma-separated",
	        )
	        make_text_input(
	            label="Who funded the data creation?",
	            key_list=key_pref + ["funding"],
	            help="[free text] enter N/A if unkown",
	        )
	        make_text_input(
	            label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
	            key_list=key_pref + ["gem-added-by"],
	            help="name (affiliation); comma-separated",
	        )


	def _render_structure_section():
	    """Render the expander describing fields, labels, examples, splits and outliers."""
	    with st.expander("Structure", expanded=False):
	        key_pref = _ensure_overview_section("structure")
	        # The continuation lines of this literal are part of the help text and
	        # are kept exactly as in the original.
	        data_fields_help = """
	[free text; paragraphs]
	- Mention their data type, and whether and how they are used as part of the generation pipeline.
	- Describe each fields' attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
	- If the datasets contain example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
	"""
	        make_text_area(
	            label="List and describe the fields present in the dataset.",
	            key_list=key_pref + ["data-fields"],
	            help=data_fields_help,
	        )
	        make_text_area(
	            label="How was the dataset structure determined?",
	            key_list=key_pref + ["structure-description"],
	            help="[free text; paragraph]",
	        )
	        make_text_area(
	            label="How were the labels chosen?",
	            key_list=key_pref + ["structure-labels"],
	            help="[free text; paragraph]",
	        )
	        make_text_area(
	            label="Provide a JSON formatted example of a typical instance in the dataset.",
	            key_list=key_pref + ["structure-example"],
	            help="[JSON]",
	        )
	        make_text_area(
	            label="Describe and name the splits in the dataset if there are more than one.",
	            key_list=key_pref + ["structure-splits"],
	            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
	        )
	        make_text_area(
	            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
	            key_list=key_pref + ["structure-splits-criteria"],
	            help="[free text, paragraphs]",
	        )
	        make_text_area(
	            label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
	            key_list=key_pref + ["structure-outlier"],
	            help="[free text + json formatted text/file for an example]",
	        )


	def overview_page():
	    """Render the dataset "overview" form as five Streamlit expanders.

	    Ensures ``st.session_state.card_dict["overview"]`` exists, then draws the
	    "what", "where", "languages", "credit" and "structure" sections in that
	    order. Each section makes sure its own sub-dict exists before creating
	    widgets whose ``key_list`` starts with ``["overview", <section>]``.
	    """
	    st.session_state.card_dict.setdefault("overview", {})
	    _render_what_section()
	    _render_where_section()
	    _render_languages_section()
	    _render_credit_section()
	    _render_structure_section()


	def overview_summary():
	    """Show a collapsed expander summarising how many overview entries exist.

	    Each count is the number of keys stored in a sub-section dict of
	    ``st.session_state.card_dict["overview"]``, displayed against the
	    matching ``N_FIELDS*`` constant.
	    """
	    overview = st.session_state.card_dict.get("overview", {})
	    # NOTE(review): this total covers every sub-dict under "overview",
	    # including "what", which N_FIELDS does not account for; confirm intended.
	    total_filled = sum(len(section) for section in overview.values())
	    sub_sections = (
	        ("Where to find", "where", N_FIELDS_WHERE),
	        ("Languages and Intended Use", "languages", N_FIELDS_LANGUAGES),
	        ("Credit", "credit", N_FIELDS_CREDIT),
	        ("Structure", "structure", N_FIELDS_STRUCTURE),
	    )
	    with st.expander(
	        f"Dataset Overview Completion - {total_filled} of {N_FIELDS}", expanded=False
	    ):
	        parts = [f"- Overall completion:\n - {total_filled} of {N_FIELDS} fields\n"]
	        for title, key, expected in sub_sections:
	            present = len(overview.get(key, {}))
	            parts.append(
	                f"- Sub-section - {title}:\n - {present} of {expected} fields\n"
	            )
	        st.markdown("".join(parts))