File size: 4,272 Bytes
212696e
be48d91
212696e
 
9475ab9
bdd92c6
212696e
b727941
212696e
aa6287d
212696e
 
 
 
b727941
 
80bc608
85fb5e3
80bc608
b727941
 
 
 
aa6287d
b727941
 
 
 
212696e
 
aa6287d
 
212696e
 
aa6287d
b727941
aa6287d
212696e
b727941
212696e
b727941
aa6287d
 
 
 
 
 
b727941
 
b66bb5e
aa6287d
 
bdd92c6
7cfc852
be48d91
bdd92c6
b50cacf
 
 
 
7cfc852
3f9b5a9
 
 
 
 
b3c67da
b727941
 
 
 
 
 
 
 
2f0778f
9475ab9
85fb5e3
9475ab9
 
b727941
212696e
 
 
 
 
 
b0781a3
b66bb5e
 
 
 
 
79a5877
 
 
 
20aaf01
9b1a4d2
b66bb5e
 
b727941
 
f842e59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import DatasetFilter, list_datasets

# Load environment variables from a local .env file when one is present.
if Path(".env").is_file():
    load_dotenv(".env")

# Hub access token passed to the dataset listing call; None when the variable is unset.
auth_token = os.getenv("HF_HUB_TOKEN")

# Config names of the "ought/raft" dataset, i.e. the benchmark tasks, in sorted order.
TASKS = sorted(get_dataset_config_names("ought/raft"))
# Split and capitalize the task names, e.g. banking_77 => Banking 77
# The display names are built in the same order as TASKS and deliberately NOT
# re-sorted: the two lists are zipped index-by-index when the leaderboard
# columns are renamed, and sorting them independently can break the pairing
# (e.g. "a1" < "a_1", but "A 1" < "A1").
FORMATTED_TASK_NAMES = [" ".join(part.capitalize() for part in task.split("_")) for task in TASKS]


def extract_tags(dataset):
    """Parse a dataset's ``key:value`` tags into a dictionary.

    Args:
        dataset: Object exposing a ``tags`` iterable of strings.

    Returns:
        dict: Maps the text before the first ``:`` of each tag to the text
        after it. Tags without a ``:`` carry no key and are skipped. When a
        key occurs more than once, the last occurrence wins. An absent or
        ``None`` ``tags`` attribute yields an empty dict.
    """
    tags = {}
    for tag in getattr(dataset, "tags", None) or ():
        key, separator, value = tag.partition(":")
        if not separator:
            # Free-form tag with no key part; unpacking a split() result
            # here would raise ValueError, so just ignore it.
            continue
        tags[key] = value
    return tags


def download_submissions():
    """List the datasets tagged for the RAFT benchmark and keep the evaluations.

    Returns:
        list: The listed dataset entries whose parsed tags map ``type`` to
        ``evaluation``.
    """
    raft_filter = DatasetFilter(benchmark="raft")
    candidates = list_datasets(filter=raft_filter, cardData=True, use_auth_token=auth_token)
    # Only entries explicitly marked as evaluations belong on the leaderboard.
    return [entry for entry in candidates if extract_tags(entry).get("type") == "evaluation"]

def _parse_epoch_timestamp(raw_timestamp):
    """Convert an epoch timestamp string to a ``pandas.Timestamp``, inferring its unit from the digit count."""
    value = int(raw_timestamp)
    digits = len(str(abs(value)))
    # Present-day epochs have 10 digits in seconds, 13 in milliseconds,
    # 16 in microseconds and 19 in nanoseconds. Relying on the pandas default
    # (nanoseconds) for every long value maps microsecond stamps to 1970.
    if digits <= 10:
        unit = "s"
    elif digits <= 13:
        unit = "ms"
    elif digits <= 16:
        unit = "us"
    else:
        unit = "ns"
    return pd.to_datetime(value, unit=unit)


def format_submissions(submissions):
    """Build the ranked leaderboard table from evaluation submissions.

    Args:
        submissions: Iterable of objects exposing ``cardData``, a mapping with
            the keys ``submission_dataset`` (``"<user>/<name>"``),
            ``submission_id`` (``"<name>__<sha>__<epoch>"``) and ``results``
            (a list of ``{"task": {"name": ..., "metrics": [{"value": ...}]}}``).

    Returns:
        pandas.DataFrame: One row per submission with the columns ``Rank``,
        ``Submitter``, ``Submission Name``, ``Submission Date``, ``Overall``
        and one column per task (renamed to its display name), sorted by
        ``Overall`` in descending order.

    Raises:
        KeyError: If a card lacks one of the required keys.
        ValueError: If ``submission_id`` does not have three ``__``-separated
            parts or its timestamp is not an integer.
    """
    columns = ["Submitter", "Submission Name", "Submission Date", *TASKS]

    # Assemble one dict per submission so that every row is complete before it
    # is added; appending to per-column lists can leave them with different
    # lengths when a card misses a task or fails halfway through.
    rows = []
    for submission in submissions:
        card_data = submission.cardData
        username = card_data["submission_dataset"].split("/")[0]
        # Split from the right so a "__" inside the submission name itself
        # cannot break the three-way unpacking.
        submission_name, _sha, timestamp = card_data["submission_id"].rsplit("__", 2)
        # Format submission names with new backend constraints
        # TODO(lewtun): make this less hacky!
        submission_name = submission_name.replace("_XXX_", " ")
        # Handle mismatch in epoch microseconds vs epoch seconds in new AutoTrain API
        submitted_on = _parse_epoch_timestamp(timestamp)

        row = {
            "Submitter": username,
            "Submission Name": submission_name,
            "Submission Date": submitted_on.strftime("%b %d, %Y"),
        }
        for result in card_data["results"]:
            task_data = result["task"]
            row[task_data["name"]] = task_data["metrics"][0]["value"]
        rows.append(row)

    # `columns` fixes the layout: tasks a submission did not report become
    # NaN, and result names that are not known tasks are dropped.
    df = pd.DataFrame(rows, columns=columns)
    # skipna=False keeps an incomplete submission from being ranked on the
    # average of only the tasks it reported; its Overall is NaN and sorts last.
    df.insert(3, "Overall", df[TASKS].mean(axis=1, skipna=False))
    df = df.sort_values("Overall", ascending=False)
    df = df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)))
    # Start ranking from 1
    df.insert(0, "Rank", np.arange(1, len(df) + 1))
    return df


###########
### APP ###
###########
# Page layout, title and the static introduction to the benchmark.
st.set_page_config(layout="wide")
st.title("RAFT: Real-world Annotated Few-shot Tasks")
st.markdown(
    """
Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? 

[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:

- across multiple domains (lit review, tweets, customer interaction, etc.)
- on economically valuable classification tasks (someone inherently cares about the task)
- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)

To submit to RAFT, follow the instruction posted on [this page](https://huggingface.co/datasets/ought/raft-submission).
"""
)
# Fetch the evaluation submissions and turn them into the ranked table.
submissions = download_submissions()
df = format_submissions(submissions)
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# Prefer Styler.format(precision=...) and fall back to the old method only on
# pandas versions whose format() does not accept that keyword.
try:
    styler = df.style.format(precision=3)
except TypeError:
    styler = df.style.set_precision(3)
styler = styler.set_properties(**{"white-space": "pre-wrap", "text-align": "center"})
# hack to remove index column: https://discuss.streamlit.io/t/questions-on-st-table/6878/3
st.markdown(
    """
<style>
table td:nth-child(1) {
    display: none
}
table th:nth-child(1) {
    display: none
}
</style>
""",
    unsafe_allow_html=True,
)
st.table(styler)