Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
•
8a2ec29
1
Parent(s):
9994065
first half done
Browse files
- datacards/curation.py +128 -105
- datacards/overview.py +26 -11
datacards/curation.py
CHANGED
@@ -11,7 +11,7 @@ from .streamlit_utils import (
|
|
11 |
)
|
12 |
|
13 |
N_FIELDS_ORIGINAL = 4
|
14 |
-
N_FIELDS_LANGUAGE =
|
15 |
N_FIELDS_ANNOTATIONS = 10
|
16 |
N_FIELDS_CONSENT = 4
|
17 |
N_FIELDS_PII = 7
|
@@ -52,11 +52,14 @@ def curation_page():
|
|
52 |
key_list=key_pref + ["is-aggregated"],
|
53 |
help="e.g. Wikipedia, movi dialogues, etc.",
|
54 |
)
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
60 |
|
61 |
with st.expander("Language Data", expanded=False):
|
62 |
key_pref = ["curation", "language"]
|
@@ -74,38 +77,49 @@ def curation_page():
|
|
74 |
],
|
75 |
key_list=key_pref + ["obtained"],
|
76 |
)
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
"
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
make_text_area(
|
100 |
label="What further information do we have on the language producers?",
|
101 |
key_list=key_pref + ["producers-description"],
|
102 |
help="Provide a description of the context in which the language was produced and who produced it.",
|
103 |
)
|
104 |
-
make_text_input(
|
105 |
-
label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
|
106 |
-
key_list=key_pref + ["machine-generated"],
|
107 |
-
help="if the generation code is unavailable, enter N/A",
|
108 |
-
)
|
109 |
make_selectbox(
|
110 |
label="Was the text validated by a different worker or a data curator?",
|
111 |
options=[
|
@@ -117,16 +131,6 @@ def curation_page():
|
|
117 |
key_list=key_pref + ["validated"],
|
118 |
help="this question is about human or human-in-the-loop validation only",
|
119 |
)
|
120 |
-
make_multiselect(
|
121 |
-
label="In what kind of organization did the curation happen?",
|
122 |
-
options=["industry", "academic", "independent", "other"],
|
123 |
-
key_list=key_pref + ["organization-type"],
|
124 |
-
)
|
125 |
-
make_text_input(
|
126 |
-
label="Name the organization(s).",
|
127 |
-
key_list=key_pref + ["organization-names"],
|
128 |
-
help="comma-separated",
|
129 |
-
)
|
130 |
make_text_area(
|
131 |
label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
|
132 |
key_list=key_pref + ["pre-processed"],
|
@@ -137,11 +141,14 @@ def curation_page():
|
|
137 |
options=["not filtered", "manually", "algorithmically", "hybrid"],
|
138 |
key_list=key_pref + ["is-filtered"],
|
139 |
)
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
145 |
|
146 |
with st.expander("Structured Annotations", expanded=False):
|
147 |
key_pref = ["curation", "annotations"]
|
@@ -149,72 +156,88 @@ def curation_page():
|
|
149 |
"annotations"
|
150 |
] = st.session_state.card_dict["curation"].get("annotations", {})
|
151 |
|
152 |
-
|
153 |
label="Does the dataset have additional annotations for each instance?",
|
154 |
options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
|
155 |
key_list=key_pref + ["origin"],
|
156 |
help="Was any additional data collected?",
|
157 |
)
|
158 |
|
159 |
-
# TODO: If yes....
|
160 |
# If expert or crowdsourced, this branch
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
-
make_text_area(
|
202 |
-
label="Purpose and values for each annoation",
|
203 |
-
key_list=key_pref + ["values"],
|
204 |
-
help="Describe the purpose and possible values for each kind of annotation.",
|
205 |
-
)
|
206 |
-
make_multiselect(
|
207 |
-
label="Quality control measures?",
|
208 |
-
options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
|
209 |
-
key_list=key_pref + ["quality-control"],
|
210 |
-
help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
|
211 |
-
)
|
212 |
-
# TODO: If not none / unknown
|
213 |
-
make_text_area(
|
214 |
-
label="Describe the quality control measures that were taken.",
|
215 |
-
key_list=key_pref + ["quality-control-details"],
|
216 |
-
help="Describe how quality was ensured in the data curation process.",
|
217 |
-
)
|
218 |
|
219 |
with st.expander("Consent", expanded=False):
|
220 |
key_pref = ["curation", "consent"]
|
|
|
11 |
)
|
12 |
|
13 |
N_FIELDS_ORIGINAL = 4
|
14 |
+
N_FIELDS_LANGUAGE = 10
|
15 |
N_FIELDS_ANNOTATIONS = 10
|
16 |
N_FIELDS_CONSENT = 4
|
17 |
N_FIELDS_PII = 7
|
|
|
52 |
key_list=key_pref + ["is-aggregated"],
|
53 |
help="e.g. Wikipedia, movi dialogues, etc.",
|
54 |
)
|
55 |
+
if st.session_state.card_dict["curation"]["original"]["is-aggregated"] == "yes":
|
56 |
+
make_text_area(
|
57 |
+
label="List the sources (one per line)",
|
58 |
+
key_list=key_pref + ["aggregated-sources"],
|
59 |
+
help="One source per line",
|
60 |
+
)
|
61 |
+
else:
|
62 |
+
st.session_state.card_dict["curation"]["original"]["aggregated-sources"] = "N/A"
|
63 |
|
64 |
with st.expander("Language Data", expanded=False):
|
65 |
key_pref = ["curation", "language"]
|
|
|
77 |
],
|
78 |
key_list=key_pref + ["obtained"],
|
79 |
)
|
80 |
+
if "Found" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
81 |
+
make_multiselect(
|
82 |
+
label="If found, where from?",
|
83 |
+
options=["Multiple websites", "Single website", "Offline media collection", "Other"],
|
84 |
+
key_list=key_pref + ["found"],
|
85 |
+
help="select N/A if none of the language data was found",
|
86 |
+
)
|
87 |
+
else:
|
88 |
+
st.session_state.card_dict["curation"]["language"]["found"] = []
|
89 |
+
if "Crowdsourced" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
90 |
+
make_multiselect(
|
91 |
+
label="If crowdsourced, where from?",
|
92 |
+
options=[
|
93 |
+
"Amazon Mechanical Turk",
|
94 |
+
"Other crowdworker platform",
|
95 |
+
"Participatory experiment",
|
96 |
+
"Other",
|
97 |
+
],
|
98 |
+
key_list=key_pref + ["crowdsourced"],
|
99 |
+
help="select N/A if none of the language data was crowdsourced",
|
100 |
+
)
|
101 |
+
else:
|
102 |
+
st.session_state.card_dict["curation"]["language"]["crowdsourced"] = []
|
103 |
+
if "Created for the dataset" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
104 |
+
make_text_area(
|
105 |
+
label="If created for the dataset, describe the creation process.",
|
106 |
+
key_list=key_pref + ["created"],
|
107 |
+
)
|
108 |
+
else:
|
109 |
+
st.session_state.card_dict["curation"]["language"]["created"] = "N/A"
|
110 |
+
if "Machine-generated" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
|
111 |
+
make_text_input(
|
112 |
+
label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
|
113 |
+
key_list=key_pref + ["machine-generated"],
|
114 |
+
help="if the generation code is unavailable, enter N/A",
|
115 |
+
)
|
116 |
+
else:
|
117 |
+
st.session_state.card_dict["curation"]["language"]["machine-generated"] = "N/A"
|
118 |
make_text_area(
|
119 |
label="What further information do we have on the language producers?",
|
120 |
key_list=key_pref + ["producers-description"],
|
121 |
help="Provide a description of the context in which the language was produced and who produced it.",
|
122 |
)
|
|
|
|
|
|
|
|
|
|
|
123 |
make_selectbox(
|
124 |
label="Was the text validated by a different worker or a data curator?",
|
125 |
options=[
|
|
|
131 |
key_list=key_pref + ["validated"],
|
132 |
help="this question is about human or human-in-the-loop validation only",
|
133 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
make_text_area(
|
135 |
label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
|
136 |
key_list=key_pref + ["pre-processed"],
|
|
|
141 |
options=["not filtered", "manually", "algorithmically", "hybrid"],
|
142 |
key_list=key_pref + ["is-filtered"],
|
143 |
)
|
144 |
+
if st.session_state.card_dict["curation"]["language"]["is-filtered"] == "not filtered":
|
145 |
+
st.session_state.card_dict["curation"]["language"]["filtered-criteria"] = "N/A"
|
146 |
+
else:
|
147 |
+
make_text_area(
|
148 |
+
label="What were the selection criteria?",
|
149 |
+
key_list=key_pref + ["filtered-criteria"],
|
150 |
+
help="Describe the process for selecting instances to include in the dataset, including any tools used.",
|
151 |
+
)
|
152 |
|
153 |
with st.expander("Structured Annotations", expanded=False):
|
154 |
key_pref = ["curation", "annotations"]
|
|
|
156 |
"annotations"
|
157 |
] = st.session_state.card_dict["curation"].get("annotations", {})
|
158 |
|
159 |
+
make_selectbox(
|
160 |
label="Does the dataset have additional annotations for each instance?",
|
161 |
options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
|
162 |
key_list=key_pref + ["origin"],
|
163 |
help="Was any additional data collected?",
|
164 |
)
|
165 |
|
|
|
166 |
# If expert or crowdsourced, this branch
|
167 |
+
if st.session_state.card_dict["curation"]["annotations"]["origin"] in ["expert created", "crowd-sourced"]:
|
168 |
+
make_selectbox(
|
169 |
+
label="What is the number of raters ",
|
170 |
+
options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
|
171 |
+
key_list=key_pref + ["rater-number"],
|
172 |
+
help="How many raters were used to create the additional annotations?",
|
173 |
+
)
|
174 |
+
make_text_area(
|
175 |
+
label="Describe the qualifications required of an annotator.",
|
176 |
+
key_list=key_pref + ["rater-qualifications"],
|
177 |
+
help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
|
178 |
+
)
|
179 |
+
make_selectbox(
|
180 |
+
label="How many annotators saw each training example?",
|
181 |
+
options=["0", "1", "2", "3", "4", "5", ">5"],
|
182 |
+
key_list=key_pref + ["rater-training-num"],
|
183 |
+
help="",
|
184 |
+
)
|
185 |
+
make_selectbox(
|
186 |
+
label="How many annotators saw each test example?",
|
187 |
+
options=["0", "1", "2", "3", "4", "5", ">5"],
|
188 |
+
key_list=key_pref + ["rater-test-num"],
|
189 |
+
help="",
|
190 |
+
)
|
191 |
+
make_radio(
|
192 |
+
label="Was an annotation service used?",
|
193 |
+
options=["no", "yes", "unknown"],
|
194 |
+
key_list=key_pref + ["rater-annotation-service-bool"],
|
195 |
+
help="",
|
196 |
+
)
|
197 |
+
if st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] == "yes":
|
198 |
+
make_multiselect(
|
199 |
+
label="Which annotation services were used?",
|
200 |
+
options=[
|
201 |
+
"Amazon Mechanical Turk", "Prolific Academic",
|
202 |
+
"Upwork", "Appen", "Crowdflower", "other"
|
203 |
+
],
|
204 |
+
key_list=key_pref + ["rater-annotation-service"],
|
205 |
+
)
|
206 |
+
else:
|
207 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
|
208 |
+
else:
|
209 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-number"] = "N/A"
|
210 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-qualifications"] = "N/A"
|
211 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-training-num"] = "N/A"
|
212 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-test-num"] = "N/A"
|
213 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] = "no"
|
214 |
+
st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
|
215 |
|
216 |
+
if st.session_state.card_dict["curation"]["annotations"]["origin"] != "none":
|
217 |
+
make_text_area(
|
218 |
+
label="Purpose and values for each annoation",
|
219 |
+
key_list=key_pref + ["values"],
|
220 |
+
help="Describe the purpose and possible values for each kind of annotation.",
|
221 |
+
)
|
222 |
+
make_selectbox(
|
223 |
+
label="Quality control measures?",
|
224 |
+
options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
|
225 |
+
key_list=key_pref + ["quality-control"],
|
226 |
+
help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
|
227 |
+
)
|
228 |
+
if st.session_state.card_dict["curation"]["annotations"]["quality-control"] in ["none", "unknown"]:
|
229 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
|
230 |
+
else:
|
231 |
+
make_text_area(
|
232 |
+
label="Describe the quality control measures that were taken.",
|
233 |
+
key_list=key_pref + ["quality-control-details"],
|
234 |
+
help="Describe how quality was ensured in the data curation process.",
|
235 |
+
)
|
236 |
+
else:
|
237 |
+
st.session_state.card_dict["curation"]["annotations"]["values"] = "N/A"
|
238 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control"] = []
|
239 |
+
st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
|
240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
with st.expander("Consent", expanded=False):
|
243 |
key_pref = ["curation", "consent"]
|
datacards/overview.py
CHANGED
@@ -13,7 +13,7 @@ from .streamlit_utils import (
|
|
13 |
|
14 |
N_FIELDS_WHERE = 9
|
15 |
N_FIELDS_LANGUAGES = 8
|
16 |
-
N_FIELDS_CREDIT =
|
17 |
N_FIELDS_STRUCTURE = 7
|
18 |
|
19 |
N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
|
@@ -65,16 +65,20 @@ def overview_page():
|
|
65 |
key_list=key_pref + ["has-leaderboard"],
|
66 |
help="If no, enter N/A for the following two fields",
|
67 |
)
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
78 |
make_text_input(
|
79 |
label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
|
80 |
key_list=key_pref + ["contact-name"],
|
@@ -127,6 +131,7 @@ def overview_page():
|
|
127 |
label="What primary task does the dataset support?",
|
128 |
key_list=key_pref + ["task"],
|
129 |
options=[
|
|
|
130 |
"Content Transfer",
|
131 |
"Data-to-Text",
|
132 |
"Dialog Response Generation",
|
@@ -150,6 +155,16 @@ def overview_page():
|
|
150 |
st.session_state.card_dict["overview"][
|
151 |
"credit"
|
152 |
] = st.session_state.card_dict.get("credit", {})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
make_text_input(
|
154 |
label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
|
155 |
key_list=key_pref + ["creators"],
|
|
|
13 |
|
14 |
N_FIELDS_WHERE = 9
|
15 |
N_FIELDS_LANGUAGES = 8
|
16 |
+
N_FIELDS_CREDIT = 5
|
17 |
N_FIELDS_STRUCTURE = 7
|
18 |
|
19 |
N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
|
|
|
65 |
key_list=key_pref + ["has-leaderboard"],
|
66 |
help="If no, enter N/A for the following two fields",
|
67 |
)
|
68 |
+
if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
|
69 |
+
make_text_input(
|
70 |
+
label="Provide a link to the leaderboard.",
|
71 |
+
key_list=key_pref + ["leaderboard-url"],
|
72 |
+
help="[URL] or N/A",
|
73 |
+
)
|
74 |
+
make_text_area(
|
75 |
+
label="Briefly describe how the leaderboard evaluates models.",
|
76 |
+
key_list=key_pref + ["leaderboard-description"],
|
77 |
+
help="[free text; a paragraph] or N/A",
|
78 |
+
)
|
79 |
+
else:
|
80 |
+
st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
|
81 |
+
st.session_state.card_dict["overview"]["where"]["leaderboard-description"] = "N/A"
|
82 |
make_text_input(
|
83 |
label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
|
84 |
key_list=key_pref + ["contact-name"],
|
|
|
131 |
label="What primary task does the dataset support?",
|
132 |
key_list=key_pref + ["task"],
|
133 |
options=[
|
134 |
+
"", # default needs to be invalid value to make sure people actually fill in
|
135 |
"Content Transfer",
|
136 |
"Data-to-Text",
|
137 |
"Dialog Response Generation",
|
|
|
155 |
st.session_state.card_dict["overview"][
|
156 |
"credit"
|
157 |
] = st.session_state.card_dict.get("credit", {})
|
158 |
+
make_multiselect(
|
159 |
+
label="In what kind of organization did the dataset curation happen?",
|
160 |
+
options=["industry", "academic", "independent", "other"],
|
161 |
+
key_list=key_pref + ["organization-type"],
|
162 |
+
)
|
163 |
+
make_text_input(
|
164 |
+
label="Name the organization(s).",
|
165 |
+
key_list=key_pref + ["organization-names"],
|
166 |
+
help="comma-separated",
|
167 |
+
)
|
168 |
make_text_input(
|
169 |
label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
|
170 |
key_list=key_pref + ["creators"],
|