Spaces:

GEM
/

DatasetCardForm

Runtime error

App Files Community

Yacine Jernite committed on Nov 16, 2021

Commit

8a2ec29

•

1 Parent(s): 9994065

first half done

Browse files

Files changed (2) hide show

datacards/curation.py +128 -105
datacards/overview.py +26 -11

datacards/curation.py CHANGED Viewed

@@ -11,7 +11,7 @@ from .streamlit_utils import (
 )
 N_FIELDS_ORIGINAL = 4
-N_FIELDS_LANGUAGE = 12
 N_FIELDS_ANNOTATIONS = 10
 N_FIELDS_CONSENT = 4
 N_FIELDS_PII = 7
@@ -52,11 +52,14 @@ def curation_page():
             key_list=key_pref + ["is-aggregated"],
             help="e.g. Wikipedia, movi dialogues, etc.",
         )
-        make_text_area(
-            label="If yes, list the sources",
-            key_list=key_pref + ["aggregated-sources"],
-            help="Otherwise, type N/A",
-        )
     with st.expander("Language Data", expanded=False):
         key_pref = ["curation", "language"]
@@ -74,38 +77,49 @@ def curation_page():
             ],
             key_list=key_pref + ["obtained"],
         )
-        make_multiselect(
-            label="If found, where from?",
-            options=["Multiple websites", "Single website", "Offline media collection", "Other", "N/A"],
-            key_list=key_pref + ["found"],
-            help="select N/A if none of the language data was found",
-        )
-        make_multiselect(
-            label="If crowdsourced, where from?",
-            options=[
-                "Amazon Mechanical Turk",
-                "Other crowdworker platform",
-                "Participatory experiment",
-                "Other",
-                "N/A",
-            ],
-            key_list=key_pref + ["crowdsourced"],
-            help="select N/A if none of the language data was crowdsourced",
-        )
-        make_text_area(
-            label="If created for the dataset, describe the creation process.",
-            key_list=key_pref + ["created"],
-        )
         make_text_area(
             label="What further information do we have on the language producers?",
             key_list=key_pref + ["producers-description"],
             help="Provide a description of the context in which the language was produced and who produced it.",
         )
-        make_text_input(
-            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
-            key_list=key_pref + ["machine-generated"],
-            help="if the generation code is unavailable, enter N/A",
-        )
         make_selectbox(
             label="Was the text validated by a different worker or a data curator?",
             options=[
@@ -117,16 +131,6 @@ def curation_page():
             key_list=key_pref + ["validated"],
             help="this question is about human or human-in-the-loop validation only",
         )
-        make_multiselect(
-            label="In what kind of organization did the curation happen?",
-            options=["industry", "academic", "independent", "other"],
-            key_list=key_pref + ["organization-type"],
-        )
-        make_text_input(
-            label="Name the organization(s).",
-            key_list=key_pref + ["organization-names"],
-            help="comma-separated",
-        )
         make_text_area(
             label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
             key_list=key_pref + ["pre-processed"],
@@ -137,11 +141,14 @@ def curation_page():
             options=["not filtered", "manually", "algorithmically", "hybrid"],
             key_list=key_pref + ["is-filtered"],
         )
-        make_text_area(
-            label="What were the selection criteria?",
-            key_list=key_pref + ["filtered-criteria"],
-            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
-        )
     with st.expander("Structured Annotations", expanded=False):
         key_pref = ["curation", "annotations"]
@@ -149,72 +156,88 @@ def curation_page():
             "annotations"
         ] = st.session_state.card_dict["curation"].get("annotations", {})
-        make_radio(
             label="Does the dataset have additional annotations for each instance?",
             options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
             key_list=key_pref + ["origin"],
             help="Was any additional data collected?",
         )
-        # TODO: If yes....
         # If expert or crowdsourced, this branch
-        make_radio(
-            label="What is the number of raters ",
-            options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
-            key_list=key_pref + ["rater-number"],
-            help="How many raters were used to create the additional annotations?",
-        )
-        make_text_area(
-            label="Describe the qualifications required of an annotator.",
-            key_list=key_pref + ["rater-qualifications"],
-            help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
-        )
-        make_radio(
-            label="How many annotators saw each training example?",
-            options=["0", "1", "2", "3", "4", "5", ">5"],
-            key_list=key_pref + ["rater-training-num"],
-            help="",
-        )
-        make_radio(
-            label="How many annotators saw each test example?",
-            options=["0", "1", "2", "3", "4", "5", ">5"],
-            key_list=key_pref + ["rater-test-num"],
-            help="",
-        )
-        make_radio(
-            label="Was an annotation service used?",
-            options=["yes", "no", "unknown"],
-            key_list=key_pref + ["rater-annotation-service-bool"],
-            help="",
-        )
-        # TODO if yes
-        make_multiselect(
-            label="Which annotation services were used?",
-            options=[
-                "Amazon Mechanical Turk", "Prolific Academic",
-                "Upwork", "Appen", "Crowdflower", "other"
-            ],
-            key_list=key_pref + ["rater-annotation-service"],
-        )
-        make_text_area(
-            label="Purpose and values for each annoation",
-            key_list=key_pref + ["values"],
-            help="Describe the purpose and possible values for each kind of annotation.",
-        )
-        make_multiselect(
-            label="Quality control measures?",
-            options=["none", "unknown",  "validated by another rater", "validated by data curators", "validated through automated script", "other"],
-            key_list=key_pref + ["quality-control"],
-            help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
-        )
-        # TODO: If not none / unknown
-        make_text_area(
-            label="Describe the quality control measures that were taken.",
-            key_list=key_pref + ["quality-control-details"],
-            help="Describe how quality was ensured in the data curation process.",
-        )
     with st.expander("Consent", expanded=False):
         key_pref = ["curation", "consent"]

 )
 N_FIELDS_ORIGINAL = 4
+N_FIELDS_LANGUAGE = 10
 N_FIELDS_ANNOTATIONS = 10
 N_FIELDS_CONSENT = 4
 N_FIELDS_PII = 7
             key_list=key_pref + ["is-aggregated"],
             help="e.g. Wikipedia, movi dialogues, etc.",
         )
+        if st.session_state.card_dict["curation"]["original"]["is-aggregated"] == "yes":
+            make_text_area(
+                label="List the sources (one per line)",
+                key_list=key_pref + ["aggregated-sources"],
+                help="One source per line",
+            )
+        else:
+            st.session_state.card_dict["curation"]["original"]["aggregated-sources"] = "N/A"
     with st.expander("Language Data", expanded=False):
         key_pref = ["curation", "language"]
             ],
             key_list=key_pref + ["obtained"],
         )
+        if "Found" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_multiselect(
+                label="If found, where from?",
+                options=["Multiple websites", "Single website", "Offline media collection", "Other"],
+                key_list=key_pref + ["found"],
+                help="select N/A if none of the language data was found",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["found"] = []
+        if "Crowdsourced" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_multiselect(
+                label="If crowdsourced, where from?",
+                options=[
+                    "Amazon Mechanical Turk",
+                    "Other crowdworker platform",
+                    "Participatory experiment",
+                    "Other",
+                ],
+                key_list=key_pref + ["crowdsourced"],
+                help="select N/A if none of the language data was crowdsourced",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["crowdsourced"] = []
+        if "Created for the dataset" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_text_area(
+                label="If created for the dataset, describe the creation process.",
+                key_list=key_pref + ["created"],
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["created"] = "N/A"
+        if "Machine-generated" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_text_input(
+                label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
+                key_list=key_pref + ["machine-generated"],
+                help="if the generation code is unavailable, enter N/A",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["machine-generated"] = "N/A"
         make_text_area(
             label="What further information do we have on the language producers?",
             key_list=key_pref + ["producers-description"],
             help="Provide a description of the context in which the language was produced and who produced it.",
         )
         make_selectbox(
             label="Was the text validated by a different worker or a data curator?",
             options=[
             key_list=key_pref + ["validated"],
             help="this question is about human or human-in-the-loop validation only",
         )
         make_text_area(
             label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
             key_list=key_pref + ["pre-processed"],
             options=["not filtered", "manually", "algorithmically", "hybrid"],
             key_list=key_pref + ["is-filtered"],
         )
+        if st.session_state.card_dict["curation"]["language"]["is-filtered"] == "not filtered":
+            st.session_state.card_dict["curation"]["language"]["filtered-criteria"] = "N/A"
+        else:
+            make_text_area(
+                label="What were the selection criteria?",
+                key_list=key_pref + ["filtered-criteria"],
+                help="Describe the process for selecting instances to include in the dataset, including any tools used.",
+            )
     with st.expander("Structured Annotations", expanded=False):
         key_pref = ["curation", "annotations"]
             "annotations"
         ] = st.session_state.card_dict["curation"].get("annotations", {})
+        make_selectbox(
             label="Does the dataset have additional annotations for each instance?",
             options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
             key_list=key_pref + ["origin"],
             help="Was any additional data collected?",
         )
         # If expert or crowdsourced, this branch
+        if st.session_state.card_dict["curation"]["annotations"]["origin"] in ["expert created", "crowd-sourced"]:
+            make_selectbox(
+                label="What is the number of raters ",
+                options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
+                key_list=key_pref + ["rater-number"],
+                help="How many raters were used to create the additional annotations?",
+            )
+            make_text_area(
+                label="Describe the qualifications required of an annotator.",
+                key_list=key_pref + ["rater-qualifications"],
+                help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
+            )
+            make_selectbox(
+                label="How many annotators saw each training example?",
+                options=["0", "1", "2", "3", "4", "5", ">5"],
+                key_list=key_pref + ["rater-training-num"],
+                help="",
+            )
+            make_selectbox(
+                label="How many annotators saw each test example?",
+                options=["0", "1", "2", "3", "4", "5", ">5"],
+                key_list=key_pref + ["rater-test-num"],
+                help="",
+            )
+            make_radio(
+                label="Was an annotation service used?",
+                options=["no", "yes", "unknown"],
+                key_list=key_pref + ["rater-annotation-service-bool"],
+                help="",
+            )
+            if st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] == "yes":
+                make_multiselect(
+                    label="Which annotation services were used?",
+                    options=[
+                        "Amazon Mechanical Turk", "Prolific Academic",
+                        "Upwork", "Appen", "Crowdflower", "other"
+                    ],
+                    key_list=key_pref + ["rater-annotation-service"],
+                )
+            else:
+                st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
+        else:
+            st.session_state.card_dict["curation"]["annotations"]["rater-number"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-qualifications"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-training-num"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-test-num"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] = "no"
+            st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
+        if st.session_state.card_dict["curation"]["annotations"]["origin"] != "none":
+            make_text_area(
+                label="Purpose and values for each annoation",
+                key_list=key_pref + ["values"],
+                help="Describe the purpose and possible values for each kind of annotation.",
+            )
+            make_selectbox(
+                label="Quality control measures?",
+                options=["none", "unknown",  "validated by another rater", "validated by data curators", "validated through automated script", "other"],
+                key_list=key_pref + ["quality-control"],
+                help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
+            )
+            if st.session_state.card_dict["curation"]["annotations"]["quality-control"] in ["none", "unknown"]:
+                st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
+            else:
+                make_text_area(
+                    label="Describe the quality control measures that were taken.",
+                    key_list=key_pref + ["quality-control-details"],
+                    help="Describe how quality was ensured in the data curation process.",
+                )
+        else:
+            st.session_state.card_dict["curation"]["annotations"]["values"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["quality-control"] = []
+            st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
     with st.expander("Consent", expanded=False):
         key_pref = ["curation", "consent"]

datacards/overview.py CHANGED Viewed

@@ -13,7 +13,7 @@ from .streamlit_utils import (
 N_FIELDS_WHERE = 9
 N_FIELDS_LANGUAGES = 8
-N_FIELDS_CREDIT = 3
 N_FIELDS_STRUCTURE = 7
 N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
@@ -65,16 +65,20 @@ def overview_page():
             key_list=key_pref + ["has-leaderboard"],
             help="If no, enter N/A for the following two fields",
         )
-        make_text_input(
-            label="Provide a link to the leaderboard if it exists. Otherwise, enter N/A.",
-            key_list=key_pref + ["leaderboard-url"],
-            help="[URL] or N/A",
-        )
-        make_text_area(
-            label="Briefly describe how the leaderboard evaluates models if it exists. Otherwise, enter N/A.",
-            key_list=key_pref + ["leaderboard-description"],
-            help="[free text; a paragraph] or N/A",
-        )
         make_text_input(
             label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
             key_list=key_pref + ["contact-name"],
@@ -127,6 +131,7 @@ def overview_page():
             label="What primary task does the dataset support?",
             key_list=key_pref + ["task"],
             options=[
                 "Content Transfer",
                 "Data-to-Text",
                 "Dialog Response Generation",
@@ -150,6 +155,16 @@ def overview_page():
         st.session_state.card_dict["overview"][
             "credit"
         ] = st.session_state.card_dict.get("credit", {})
         make_text_input(
             label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
             key_list=key_pref + ["creators"],

 N_FIELDS_WHERE = 9
 N_FIELDS_LANGUAGES = 8
+N_FIELDS_CREDIT = 5
 N_FIELDS_STRUCTURE = 7
 N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
             key_list=key_pref + ["has-leaderboard"],
             help="If no, enter N/A for the following two fields",
         )
+        if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
+            make_text_input(
+                label="Provide a link to the leaderboard.",
+                key_list=key_pref + ["leaderboard-url"],
+                help="[URL] or N/A",
+            )
+            make_text_area(
+                label="Briefly describe how the leaderboard evaluates models.",
+                key_list=key_pref + ["leaderboard-description"],
+                help="[free text; a paragraph] or N/A",
+            )
+        else:
+            st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
+            st.session_state.card_dict["overview"]["where"]["leaderboard-description"] = "N/A"
         make_text_input(
             label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
             key_list=key_pref + ["contact-name"],
             label="What primary task does the dataset support?",
             key_list=key_pref + ["task"],
             options=[
+                "",  # default needs to be invalid value to make sure people actually fill in
                 "Content Transfer",
                 "Data-to-Text",
                 "Dialog Response Generation",
         st.session_state.card_dict["overview"][
             "credit"
         ] = st.session_state.card_dict.get("credit", {})
+        make_multiselect(
+            label="In what kind of organization did the dataset curation happen?",
+            options=["industry", "academic", "independent", "other"],
+            key_list=key_pref + ["organization-type"],
+        )
+        make_text_input(
+            label="Name the organization(s).",
+            key_list=key_pref + ["organization-names"],
+            help="comma-separated",
+        )
         make_text_input(
             label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
             key_list=key_pref + ["creators"],