LeaderboardsExplorer

Sleeping

File size: 4,349 Bytes

from enum import Enum
from dataclasses import dataclass

@dataclass(frozen=True)
class Tag:
    """Descriptor for a single leaderboard tag.

    Instances are used as the values of the tag enums in this module, so the
    dataclass is frozen: a frozen dataclass is immutable (an enum constant
    cannot be edited in place by accident) and hashable (tags can be put in
    sets, used as dict keys, and looked up by value in an ``Enum`` without a
    linear scan).
    """

    key: str  # namespaced identifier, e.g. "submission:automatic"
    name: str  # for display
    usage: str  # explains usage
    icon: str  # icon shown with the tag; every tag in this file leaves it empty

class SubmissionType(Enum):
    """Tags describing how a leaderboard takes in new submissions.

    Every member's value is a ``Tag`` whose key carries the ``submission:``
    prefix.
    """

    automatic = Tag(
        key="submission:automatic",
        name="Automatic",
        usage=(
            "users can submit their models as such to the leaderboard, "
            "and evaluation is run automatically without human intervention"
        ),
        icon="",
    )
    semiautomatic = Tag(
        key="submission:semiautomatic",
        name="Semi Automatic",
        usage=(
            "the leaderboard requires the model owner to run evaluations "
            "on his side and submit the results"
        ),
        icon="",
    )
    manual = Tag(
        key="submission:manual",
        name="Manual",
        usage=(
            "the leaderboard requires the leaderboard owner "
            "to run evaluations for new submissions"
        ),
        icon="",
    )
    closed = Tag(
        key="submission:closed",
        name="Closed",
        usage="the leaderboard does not accept submissions at the moment",
        icon="",
    )

class TestSetStatus(Enum):
    """Tags describing the availability of a leaderboard's test sets.

    Every member's value is a ``Tag`` whose key carries the ``test:`` prefix.
    """

    public = Tag(
        key="test:public",
        name="Public",
        usage=(
            "all the test sets used are public, "
            "the evaluations are completely reproducible"
        ),
        icon="",
    )
    mix = Tag(
        key="test:mix",
        name="Mix",
        usage="some test sets are public and some private",
        icon="",
    )
    private = Tag(
        key="test:private",
        name="Private",
        usage=(
            "all the test sets used are private, "
            "the evaluations are hard to game"
        ),
        icon="",
    )
    rolling = Tag(
        key="test:rolling",
        name="Rolling",
        usage=(
            "the test sets used change regularly through time "
            "and evaluation scores are refreshed"
        ),
        icon="",
    )

class Judge(Enum):
    """Tags describing who or what scores the evaluated answers.

    Every member's value is a ``Tag`` whose key carries the ``judge:`` prefix.
    """

    auto = Tag(
        key="judge:auto",
        name="Automatic metric",
        usage=(
            "evaluations are run automatically, "
            "using an evaluation suite such as `lm_eval` or `lighteval`"
        ),
        icon="",
    )
    model = Tag(
        key="judge:model",
        name="Model",
        usage=(
            "evaluations are run using a model as a judge approach "
            "to rate answer"
        ),
        icon="",
    )
    humans = Tag(
        key="judge:humans",
        name="Human",
        usage=(
            "evaluations are done by humans to rate answer "
            "- this is an arena"
        ),
        icon="",
    )
    vibe_check = Tag(
        key="judge:vibe_check",
        name="Vibe check",
        usage="evaluations are done manually by one or several humans",
        icon="",
    )

class Modality(Enum):
    """Tags describing the modality a leaderboard is concerned with.

    Every member's value is a ``Tag`` whose key carries the ``modality:``
    prefix.
    """

    # The plain modalities carry no usage text.
    text = Tag(key="modality:text", name="Text", usage="", icon="")
    image = Tag(key="modality:image", name="Image", usage="", icon="")
    audio = Tag(key="modality:audio", name="Audio", usage="", icon="")
    video = Tag(key="modality:video", name="Video", usage="", icon="")

    # The two entries below describe themselves as "a bit outside of usual
    # modalities", so they do carry an explanation.
    tools = Tag(
        key="modality:tools",
        name="Tools",
        usage=(
            "requires added tool usage - mostly for assistant models "
            "(a bit outside of usual modalities)"
        ),
        icon="",
    )
    artefacts = Tag(
        key="modality:artefacts",
        name="Artefacts",
        usage=(
            "the leaderboard concerns itself with machine learning artefacts "
            "as themselves, for example, quality evaluation of text embeddings "
            "(a bit outside of usual modalities)"
        ),
        icon="",
    )

class EvaluationCategory(Enum):
    """Tags describing what capability or property an evaluation targets.

    Every member's value is a ``Tag`` whose key carries the ``eval:`` prefix.
    """

    generation = Tag(
        key="eval:generation",
        name="Generation",
        # The usage text ends with a trailing space; it is kept as-is because
        # it is part of the runtime value.
        usage=(
            "the evaluation looks at generation capabilities specifically "
            "(can be image generation, text generation, ...) "
        ),
        icon="",
    )
    math = Tag(
        key="eval:math",
        name="Math",
        usage="the evaluation tests math abilities",
        icon="",
    )
    code = Tag(
        key="eval:code",
        name="Code",
        usage="the evaluation tests coding capabilities",
        icon="",
    )
    performance = Tag(
        key="eval:performance",
        name="Performance",
        usage="model performance (speed, energy consumption, ...)",
        icon="",
    )
    safety = Tag(
        key="eval:safety",
        name="Safety",
        usage="the evaluation considers safety, toxicity, bias",
        icon="",
    )