import os
import pathlib
from argparse import ArgumentParser
from json import load


def multi_grep(d, l1, l2, l3):
    """Look up d[l1][l2][l3], falling back to the GEM placeholder string."""
    return d.get(l1, {}).get(l2, {}).get(l3, "[Needs More Information]")


def multi_grep2(d, l1, l2, l3):
    """Like multi_grep, but falls back to ["unknown"] for list-valued fields."""
    return d.get(l1, {}).get(l2, {}).get(l3, ["unknown"])


def sanitize_md_url(s):
    """Strip Markdown link syntax, e.g. "[name](url)" -> "url", if present."""
    if "](" in s:
        return s.split("](")[1].replace(")", "")
    return s
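

# Quick sanity checks for the helpers above (hypothetical values, not taken
# from a real GEM card):
#   multi_grep({"a": {"b": {"c": "x"}}}, "a", "b", "c")  -> "x"
#   multi_grep({}, "a", "b", "c")                        -> "[Needs More Information]"
#   sanitize_md_url("[ACL](https://aclanthology.org)")   -> "https://aclanthology.org"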


# Example of the YAML preamble this script emits, here for FairytaleQA:
# ---
# annotations_creators:
# - expert-generated
# language_creators:
# - found
# languages:
# - en
# licenses:
# - unknown
# multilinguality:
# - monolingual
# pretty_name: FairytaleQA
# size_categories:
# - 10K<n<100K
# source_datasets:
# - original
# task_categories:
# - question-generation
# task_ids:
# - abstractive-qg
# ---


def construct_preamble(data, name):
    """Build the YAML metadata preamble (see the example above) from a card."""
    pre = "---\n"
    pre += "annotations_creators:\n"
    # e.g. "- expert-generated"
    s = multi_grep(data, "curation", "annotations", "origin")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        pre += "- " + s.replace(" ", "-") + "\n"
    pre += "language_creators:\n- unknown\n"
    pre += "languages:"
    languages = multi_grep2(data, "overview", "languages", "language_names")
    for l in languages:
        pre += f"\n- {l}"
    pre += "\nlicenses:\n"
    s = multi_grep(data, "overview", "languages", "license")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        pre += "- " + s.split(":")[0] + "\n"
    pre += "multilinguality:\n"
    if languages == ["unknown"]:
        pre += "- unknown"
    elif len(languages) == 1:
        pre += "- monolingual"
    else:
        pre += "- multilingual"
    pre += f"\npretty_name: {name}\n"
    pre += "size_categories:\n- unknown\n"
    pre += "source_datasets:\n- original\n"
    pre += "task_categories:\n"
    s = multi_grep(data, "overview", "languages", "task")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        pre += "- " + "-".join(s.lower().split(" ")) + "\n"
    # e.g. "- question-generation"
    pre += "task_ids:\n- unknown\n"
    # e.g. "- abstractive-qg"
    pre += "---\n\n"
    return pre
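

# For instance, a minimal (hypothetical) card degrades gracefully to "unknown"
# entries everywhere except the fields it does carry:
#   construct_preamble({"overview": {"languages": {"language_names": ["en"]}}},
#                      "FairytaleQA")
# yields a preamble containing "languages:\n- en" and
# "multilinguality:\n- monolingual", with the remaining lists set to unknown.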


## Table of Contents
# - [Dataset Description](#dataset-description)
#   - [Dataset Summary](#dataset-summary)
#   - [Supported Tasks](#supported-tasks-and-leaderboards)
#   - [Languages](#languages)
# - [Dataset Structure](#dataset-structure)
#   - [Data Instances](#data-instances)
#   - [Data Fields](#data-fields)
#   - [Data Splits](#data-splits)
# - [Dataset Creation](#dataset-creation)
#   - [Curation Rationale](#curation-rationale)
#   - [Source Data](#source-data)
#   - [Annotations](#annotations)
#   - [Personal and Sensitive Information](#personal-and-sensitive-information)
# - [Considerations for Using the Data](#considerations-for-using-the-data)
#   - [Social Impact of Dataset](#social-impact-of-dataset)
#   - [Discussion of Biases](#discussion-of-biases)
#   - [Other Known Limitations](#other-known-limitations)
# - [Additional Information](#additional-information)
#   - [Dataset Curators](#dataset-curators)
#   - [Licensing Information](#licensing-information)
#   - [Citation Information](#citation-information)


def construct_toc(data):
    """Placeholder: the table of contents above is not yet generated programmatically."""
    pass


def construct_links(data):
    """Render the "Dataset Description" link list from the card's overview."""
    links = "## Dataset Description\n\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "website"))
    links += f"- **Homepage:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "data-url"))
    links += f"- **Repository:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "paper-url"))
    links += f"- **Paper:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "leaderboard-url"))
    links += f"- **Leaderboard:** {s}\n"
    s = multi_grep(data, "overview", "where", "contact-name")
    links += f"- **Point of Contact:** {s}\n\n"
    return links
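

# If the card is missing the "where" section entirely, every bullet falls back
# to the placeholder, e.g. construct_links({}) begins with:
#   ## Dataset Description
#
#   - **Homepage:** [Needs More Information]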


def json_to_markdown(filename, original_json_path):
    """Convert one reformatted card (plus its original) to a README.md."""
    with open(filename) as f:
        card = load(f)
    with open(original_json_path) as f:
        original_json = load(f)
    dataset_name = pathlib.Path(original_json_path).stem
    markdown = construct_preamble(original_json, dataset_name)
    markdown += f'# Dataset Card for GEM/{card["name"]}\n\n'
    # ToC here.
    markdown += construct_links(original_json)
    markdown += "### Link to Main Data Card\n\n"
    markdown += f"You can find the main data card on the [GEM Website](https://gem-benchmark.com/data_cards/{dataset_name}).\n\n"
    markdown += "### Dataset Summary\n\n"
    markdown += card["summary"] + "\n\n"
    # Emit any remaining top-level fields before the structured sections.
    for key in card:
        if key not in ("name", "summary", "sections"):
            markdown += f"#### {key}\n{card[key]}\n\n"
    markdown += "\n".join(section_to_markdown(section)
                          for section in card["sections"])
    readme_path = os.path.join(pathlib.Path(original_json_path).parent, "README.md")
    with open(readme_path, "w") as f:
        f.write(markdown)


def section_to_markdown(section):
    """Render a top-level section and all of its subsections."""
    markdown = f'{"#" * section["level"]} {section["title"]}\n\n'
    markdown += "\n".join(subsection_to_markdown(subsection)
                          for subsection in section["subsections"])
    return markdown + "\n"


def subsection_to_markdown(subsection):
    """Render a subsection and all of its fields."""
    markdown = f'{"#" * subsection["level"]} {subsection["title"]}\n\n'
    markdown += "\n".join(field_to_markdown(field)
                          for field in subsection["fields"])
    return markdown + "\n"


def field_to_markdown(field):
    """Render a single field, preserving its flags as HTML comments."""
    markdown = f'{"#" * field["level"]} {field["title"]}\n\n'
    if "quick" in field.get("flags", []):
        markdown += "<!-- quick -->\n"
    if field.get("info"):
        markdown += f'<!-- info: {field["info"]} -->\n'
    if field.get("scope"):
        markdown += f'<!-- scope: {field["scope"]} -->\n'
    markdown += field.get("content", "")
    return markdown + "\n"
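

# A single field renders like this (hypothetical values):
#   field_to_markdown({"level": 4, "title": "Who is the contact?",
#                      "flags": ["quick"], "info": "Name the point of contact.",
#                      "scope": "telescope", "content": "Jane Doe"})
# produces:
#   #### Who is the contact?
#
#   <!-- quick -->
#   <!-- info: Name the point of contact. -->
#   <!-- scope: telescope -->
#   Jane Doe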


# def main():
#     """Converts JSON output from `reformat_json.py`
#     to Markdown input for Data Cards Labs."""
#     args = parse_args()
#     for filename in args.input:
#         if filename[-5:] == '.json':
#             json_to_markdown(filename)
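
# parse_args is referenced by the commented-out main() above but never defined
# in this file; a minimal sketch of what it presumably looked like, using the
# already-imported ArgumentParser (the argument name and help text are
# assumptions):
#
# def parse_args():
#     parser = ArgumentParser(
#         description="Convert reformatted GEM JSON data cards to Markdown.")
#     parser.add_argument("input", nargs="+", help="JSON files to convert.")
#     return parser.parse_args()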


if __name__ == "__main__":
    for dataset in os.listdir("../../../GEMv2"):
        data_card_path = f"../../../GEMv2/{dataset}/{dataset}.json"
        if os.path.exists(data_card_path):
            print(f"Now processing {dataset}.")
            # This script assumes you have already run reformat_json.py,
            # which writes the reformatted card to datacards/.
            new_path = f"datacards/{dataset}.json"
            json_to_markdown(new_path, data_card_path)
        else:
            print(f"{dataset} has no data card!")