import json
import os
import shutil
import uuid
from datetime import datetime
from pathlib import Path

import jsonlines
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import Repository, cached_download, hf_hub_url

from utils import http_get, http_post, validate_json

if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
LOCAL_REPO = "submission_repo"
LOGS_REPO = "submission-logs"

# TODO
# 1. Add check that fields are nested under `tasks` field correctly
# 2. Add check that names of tasks and datasets are valid

MARKDOWN = """---
benchmark: gem
type: prediction
submission_name: {submission_name}
tags:
- evaluation
- benchmark
---

# GEM Submission

Submission name: {submission_name}
"""


def generate_dataset_card(submission_name):
    """
    Generate dataset card for the submission
    """
    markdown = MARKDOWN.format(
        submission_name=submission_name,
    )
    with open(os.path.join(LOCAL_REPO, "README.md"), "w") as f:
        f.write(markdown)


def load_json(path):
    """Load a JSON file from disk."""
    with open(path, "r") as f:
        return json.load(f)


def get_submission_names():
    """Download all submission names.

    The GEM frontend requires the submission names to be unique, so here we
    download all submission names and use them as a check against the user
    submissions.
    """
    scores_url = hf_hub_url("GEM-submissions/submission-scores", "scores.json", repo_type="dataset")
    scores_filepath = cached_download(scores_url, force_download=True)
    scores_data = load_json(scores_filepath)
    return [score["submission_name"] for score in scores_data]


#######
# APP #
#######
st.title("GEM Submissions")
st.markdown(
    """
Welcome to the [GEM benchmark](https://gem-benchmark.com/)! GEM is a benchmark environment for Natural Language
Generation with a focus on its Evaluation, both through human annotations and automated Metrics.

GEM aims to:

- measure NLG progress across many NLG tasks and languages.
- audit data and models and present results via data cards and model robustness reports.
- develop standards for evaluation of generated text using both automated and human metrics.

Use this page to submit your system's predictions to the benchmark.
"""
)

with st.form(key="form"):
    # Flush local repo
    shutil.rmtree(LOCAL_REPO, ignore_errors=True)
    submission_errors = 0
    uploaded_file = st.file_uploader("Upload submission file", type=["json"])

    if uploaded_file:
        data = str(uploaded_file.read(), "utf-8")
        json_data = json.loads(data)
        submission_names = get_submission_names()
        submission_name = json_data["submission_name"]
        if submission_name in submission_names:
            st.error(f"🙈 Submission name `{submission_name}` is already taken. Please rename your submission.")
            submission_errors += 1
        else:
            is_valid, message = validate_json(json_data)
            if is_valid:
                st.success(message)
            else:
                st.error(message)
                submission_errors += 1

    with st.expander("Submission format"):
        st.markdown(
            """
Please follow this JSON format for your `submission.json` file:

```json
{
  "submission_name": "An identifying name of your system",
  "param_count": 123, # The number of parameters your system has.
  "description": "An optional brief description of the system that will be shown on the results page",
  "tasks": {
    "dataset_identifier": {
      "values": ["output-0", "output-1", "..."], # A list of system outputs.
      "keys": ["gem_id-0", "gem_id-1", ...] # A list of GEM IDs.
    }
  }
}
```

Here, `dataset_identifier` is the identifier of the dataset followed by an identifier of the set the outputs were
created from, for example `_validation` or `_test`. For example, the `mlsum_de` test set has the identifier
`mlsum_de_test`.

The `keys` field is needed to avoid accidental shuffling that will impact your metrics. Simply add a list of IDs
from the `gem_id` column of each evaluation dataset in the same order as your values.

Please see the sample submission below:
"""
        )
        with open("sample-submission.json", "r") as f:
            example_submission = json.load(f)
        st.json(example_submission)

    user_name = st.text_input(
        "Enter your 🤗 Hub username",
        help="This field is required to track your submission and cannot be empty",
    )
    submit_button = st.form_submit_button("Make Submission")

if submit_button and submission_errors == 0:
    with st.spinner("⏳ Preparing submission for evaluation ..."):
        submission_name = json_data["submission_name"]
        submission_name_formatted = submission_name.lower().replace(" ", "-").replace("/", "-")
        submission_time = str(int(datetime.now().timestamp()))

        # Create submission dataset under benchmarks ORG
        submission_repo_id = f"GEM-submissions/{user_name}__{submission_name_formatted}__{submission_time}"
        dataset_repo_url = f"https://huggingface.co/datasets/{submission_repo_id}"
        repo = Repository(
            local_dir=LOCAL_REPO,
            clone_from=dataset_repo_url,
            repo_type="dataset",
            private=False,
            use_auth_token=HF_TOKEN,
        )
        generate_dataset_card(submission_name)

        with open(f"{LOCAL_REPO}/submission.json", "w", encoding="utf-8") as f:
            json.dump(json_data, f)

        # TODO: add informative commit msg
        commit_url = repo.push_to_hub()
        if commit_url is not None:
            commit_sha = commit_url.split("/")[-1]
        else:
            commit_sha = repo.git_head_commit_url().split("/")[-1]

        submission_id = submission_name + "__" + str(uuid.uuid4())[:6] + "__" + submission_time

        # Define AutoTrain payload
        project_config = {}
        # Need a dummy dataset to use the dataset loader in AutoTrain
        project_config["dataset_name"] = "lewtun/imdb-dummy"
        project_config["dataset_config"] = "lewtun--imdb-dummy"
        project_config["dataset_split"] = "train"
        project_config["col_mapping"] = {"text": "text", "label": "target"}

        # Specify benchmark parameters
        project_config["model"] = "gem"
        project_config["dataset"] = "GEM/references"
        project_config["submission_dataset"] = submission_repo_id

        project_id = str(uuid.uuid4()).split("-")[0]
        project_payload = {
            "username": AUTOTRAIN_USERNAME,
            "proj_name": f"benchmark-gem-{project_id}",
            "task": 1,
            "config": {
                "language": "en",
                "max_models": 5,
                "instance": {
                    "provider": "aws",
                    "instance_type": "ml.g4dn.4xlarge",
                    "max_runtime_seconds": 172800,
                    "num_instances": 1,
                    "disk_size_gb": 150,
                },
                "benchmark": {
                    "dataset": project_config["dataset"],
                    "model": project_config["model"],
                    "submission_dataset": project_config["submission_dataset"],
                },
            },
        }
        project_json_resp = http_post(
            path="/projects/create", payload=project_payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
        ).json()
        print(f"Project creation: {project_json_resp}")

        # Upload data
        payload = {
            "split": 4,
            "col_mapping": project_config["col_mapping"],
            "load_config": {"max_size_bytes": 0, "shuffle": False},
        }
        data_json_resp = http_post(
            path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}",
            payload=payload,
            token=HF_TOKEN,
            domain=AUTOTRAIN_BACKEND_API,
            params={
                "type": "dataset",
                "config_name": project_config["dataset_config"],
                "split_name": project_config["dataset_split"],
            },
        ).json()
        print(f"Dataset creation: {data_json_resp}")

        # Run training
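        # `start_process` asks the AutoTrain backend to launch the evaluation job for
        # this project; the raw response is saved to the submission logs below.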
        train_json_resp = http_get(
            path=f"/projects/{project_json_resp['id']}/data/start_process",
            token=HF_TOKEN,
            domain=AUTOTRAIN_BACKEND_API,
        ).json()
        print(f"Training job response: {train_json_resp}")

        # Record the payload and API responses in the submission logs dataset
        logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
        logs_repo = Repository(
            local_dir=LOGS_REPO,
            clone_from=logs_repo_url,
            repo_type="dataset",
            private=True,
            use_auth_token=HF_TOKEN,
        )
        evaluation_log = {}
        evaluation_log["payload"] = project_payload
        evaluation_log["project_creation_response"] = project_json_resp
        evaluation_log["dataset_creation_response"] = data_json_resp
        evaluation_log["autotrain_job_response"] = train_json_resp
        with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
            lines = []
            for obj in r:
                lines.append(obj)
        lines.append(evaluation_log)
        with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
            for job in lines:
                writer.write(job)
        logs_repo.push_to_hub(commit_message=f"Submission with job ID {project_json_resp['id']}")

        if train_json_resp["success"] == 1:
            st.success(f"✅ Submission {submission_name} was successfully submitted for evaluation!")
            st.markdown(
                f"""
Evaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:

* 📊 Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
* 💾 Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub

Please [contact the organisers](mailto:gehrmann@google.com) if you would like your submission and/or evaluation
scores deleted.
"""
            )
        else:
            st.error(
                "🙈 Oh noes, there was an error processing your submission! Please [contact the organisers](mailto:gehrmann@google.com)"
            )

        # Flush local repos
        shutil.rmtree(LOCAL_REPO, ignore_errors=True)
        shutil.rmtree(LOGS_REPO, ignore_errors=True)

with st.expander("Download all submissions and scores"):
    st.markdown("Click the button below if you'd like to download all the submissions and evaluations from GEM:")
    outputs_url = hf_hub_url(
        "GEM-submissions/v2-outputs-and-scores", "gem-v2-outputs-and-scores.zip", repo_type="dataset"
    )
    outputs_filepath = cached_download(outputs_url)
    with open(outputs_filepath, "rb") as f:
        btn = st.download_button(label="Download submissions and scores", data=f, file_name="outputs-and-scores.zip")