import json
import streamlit as st
from os.path import join as pjoin
from .streamlit_utils import (
make_multiselect,
make_selectbox,
make_text_area,
make_text_input,
make_radio,
)
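# Number of fields in each sub-section, used by overview_summary() to report completion.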
N_FIELDS_WHAT = 1
N_FIELDS_WHERE = 9
N_FIELDS_LANGUAGES = 10
N_FIELDS_CREDIT = 5
N_FIELDS_STRUCTURE = 7
N_FIELDS = (
    N_FIELDS_WHAT + N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
)
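# Load the BCP-47 subtag registry and keep only the language subtags; these
# populate the "languages/dialects covered" multiselect below.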
languages_bcp47 = [
x
for x in json.load(open(pjoin("resources", "bcp47.json"), encoding="utf-8"))[
"subtags"
]
if x["type"] == "language"
]
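# License identifiers offered in the license selectbox.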
license_list = json.load(open(pjoin("resources", "licenses.json"), encoding="utf-8"))
def overview_page():
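    """Render the "Dataset Overview" section of the data card form."""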
st.session_state.card_dict["overview"] = st.session_state.card_dict.get(
"overview", {}
)
with st.expander("What is this dataset?", expanded=True):
key_pref = ["overview", "what"]
st.session_state.card_dict["overview"]["what"] = st.session_state.card_dict[
"overview"
].get("what", {})
make_text_area(
label="Provide a summary of this dataset in 3-4 sentences.",
key_list=key_pref + ["dataset"],
help="[free text]",
)
with st.expander("Where to find the data and its documentation", expanded=False):
key_pref = ["overview", "where"]
st.session_state.card_dict["overview"]["where"] = st.session_state.card_dict[
"overview"
].get("where", {})
make_text_input(
label="What is the webpage for the dataset (if it exists)?",
key_list=key_pref + ["website"],
help="[URL]",
)
make_text_input(
label="What is the link to where the original dataset is hosted?",
key_list=key_pref + ["data-url"],
help="[URL]",
)
make_text_input(
label="What is the link to the paper describing the dataset (open access preferred)?",
key_list=key_pref + ["paper-url"],
help="[URL]",
)
make_text_area(
label="Provide the BibTex-formatted reference for the dataset. Please use the correct published version (ACL anthology, etc.) instead of google scholar created Bibtex.",
key_list=key_pref + ["paper-bibtext"],
help="[free text]",
)
make_radio(
label="Does the dataset have an active leaderboard?",
options=["no", "yes"],
key_list=key_pref + ["has-leaderboard"],
help="If no, enter N/A for the following two fields",
)
if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
make_text_input(
label="Provide a link to the leaderboard.",
key_list=key_pref + ["leaderboard-url"],
help="[URL] or N/A",
)
make_text_area(
label="Briefly describe how the leaderboard evaluates models.",
key_list=key_pref + ["leaderboard-description"],
help="[free text; a paragraph] or N/A",
)
else:
st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
st.session_state.card_dict["overview"]["where"]["leaderboard-description"] = "N/A"
make_text_input(
label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
key_list=key_pref + ["contact-name"],
help="[free text]",
)
make_text_input(
label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
key_list=key_pref + ["contact-email"],
help="[free text]",
)
with st.expander("Languages and Intended Use", expanded=False):
key_pref = ["overview", "languages"]
st.session_state.card_dict["overview"][
"languages"
] = st.session_state.card_dict["overview"].get("languages", {})
make_radio(
label="Is the dataset multilingual?",
options=["no", "yes"],
key_list=key_pref + ["is-multilingual"],
help="More than one language present in all of the text fields",
)
make_multiselect(
label="What languages/dialects are covered in the dataset?",
key_list=key_pref + ["language-names"],
options=[", ".join(x["description"]) for x in languages_bcp47],
help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
)
make_text_area(
label="What dialects are covered? Are there multiple dialects per language?",
key_list=key_pref + ["language-dialects"],
help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
)
make_text_area(
label="Whose language is in the dataset?",
key_list=key_pref + ["language-speakers"],
help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
)
make_text_area(
label="What is the intended use of the dataset?",
key_list=key_pref + ["intended-use"],
help="[free text, paragraphs] - Describe how the dataset creators describe its purpose and intended use.",
)
make_selectbox(
label="What is the license of the dataset?",
key_list=key_pref + ["license"],
options=license_list,
help="select `other` if missing from list, `unkown` if not provided.",
)
if "other" in st.session_state.card_dict["overview"]["languages"].get("license", []):
make_text_input(
label="What is the 'other' license of the dataset?",
key_list=key_pref + ["license-other"],
help="[free text]",
)
else:
st.session_state.card_dict["overview"]["languages"]["license-other"] = "N/A"
make_selectbox(
label="What primary task does the dataset support?",
key_list=key_pref + ["task"],
options=[
"", # default needs to be invalid value to make sure people actually fill in
"Content Transfer",
"Data-to-Text",
"Dialog Response Generation",
"Paraphrasing",
"Question Generation",
"Reasoning",
"Simplification",
"Style Transfer",
"Summarization",
"Text-to-Slide",
"Other"
],
help="Select `other` if the task is not included in the list.",
)
if "Other" in st.session_state.card_dict["overview"]["languages"].get("task", []):
make_text_input(
label="What is the primary task?",
key_list=key_pref + ["task-other"],
help="[free text]",
)
else:
st.session_state.card_dict["overview"]["languages"]["task-other"] = "N/A"
make_text_area(
label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
key_list=key_pref + ["communicative"],
help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
)
with st.expander("Credit", expanded=False):
key_pref = ["overview", "credit"]
st.session_state.card_dict["overview"][
"credit"
] = st.session_state.card_dict["overview"].get("credit", {})
make_multiselect(
label="In what kind of organization did the dataset curation happen?",
options=["industry", "academic", "independent", "other"],
key_list=key_pref + ["organization-type"],
)
make_text_input(
label="Name the organization(s).",
key_list=key_pref + ["organization-names"],
help="comma-separated",
)
make_text_input(
label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
key_list=key_pref + ["creators"],
help="name (affiliation); comma-separated",
)
make_text_input(
label="Who funded the data creation?",
key_list=key_pref + ["funding"],
help="[free text] enter N/A if unkown",
)
make_text_input(
label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
key_list=key_pref + ["gem-added-by"],
help="name (affiliation); comma-separated",
)
with st.expander("Structure", expanded=False):
key_pref = ["overview", "structure"]
st.session_state.card_dict["overview"]["structure"] = st.session_state.card_dict[
"overview"
].get("structure", {})
data_fields_help = """
[free text; paragraphs]
- Mention their data type, and whether and how they are used as part of the generation pipeline.
- Describe each field's attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
- If the dataset contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
"""
make_text_area(
label="List and describe the fields present in the dataset.",
key_list=key_pref + ["data-fields"],
help=data_fields_help,
)
make_text_area(
label="How was the dataset structure determined?",
key_list=key_pref + ["structure-description"],
help="[free text; paragraph]",
)
make_text_area(
label="How were the labels chosen?",
key_list=key_pref + ["structure-labels"],
help="[free text; paragraph]",
)
make_text_area(
label="Provide a JSON formatted example of a typical instance in the dataset.",
key_list=key_pref + ["structure-example"],
help="[JSON]",
)
make_text_area(
label="Describe and name the splits in the dataset if there are more than one.",
key_list=key_pref + ["structure-splits"],
help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
)
make_text_area(
label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
key_list=key_pref + ["structure-splits-criteria"],
help="[free text, paragraphs]",
)
make_text_area(
label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
key_list=key_pref + ["structure-outlier"],
help="[free text + json formatted text/file for an example]",
)
def overview_summary():
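    """Summarize how many overview fields have been filled in so far."""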
total_filled = sum(
[len(dct) for dct in st.session_state.card_dict.get("overview", {}).values()]
)
with st.expander(
f"Dataset Overview Completion - {total_filled} of {N_FIELDS}", expanded=False
):
completion_markdown = ""
completion_markdown += (
f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
)
completion_markdown += f"- **Sub-section - Where to find:**\n - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
completion_markdown += f"- **Sub-section - Credit:**\n - {len(st.session_state.card_dict.get('overview', {}).get('credit', {}))} of {N_FIELDS_CREDIT} fields\n"
completion_markdown += f"- **Sub-section - Structure:**\n - {len(st.session_state.card_dict.get('overview', {}).get('structure', {}))} of {N_FIELDS_STRUCTURE} fields\n"
st.markdown(completion_markdown)