Clémentine
fix typo in judge
64d2c90
raw history blame
No virus
4.35 kB
from enum import Enum
from dataclasses import dataclass
@dataclass(frozen=True)
class Tag:
    """Description of one leaderboard tag.

    Instances are used as the values of the tag enums in this module, so
    the dataclass is frozen: a tag cannot be modified after creation, and
    equal tags hash equally (usable in sets and as dict keys).
    """

    key: str  # identifier of the form "<category>:<value>", e.g. "judge:model"
    name: str  # for display
    usage: str  # explains usage
    icon: str  # icon string attached to the tag; may be empty
class SubmissionType(Enum):
    """Tags describing how new submissions reach a leaderboard.

    Every member's value is a ``Tag`` whose key starts with ``submission:``.
    """

    # Evaluation runs without human intervention.
    automatic = Tag(
        key="submission:automatic",
        name="Automatic",
        usage="users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention",
        icon="",
    )

    # The model owner runs the evaluation and submits results.
    semiautomatic = Tag(
        key="submission:semiautomatic",
        name="Semi Automatic",
        usage="the leaderboard requires the model owner to run evaluations on his side and submit the results",
        icon="",
    )

    # The leaderboard owner runs the evaluation.
    manual = Tag(
        key="submission:manual",
        name="Manual",
        usage="the leaderboard requires the leaderboard owner to run evaluations for new submissions",
        icon="",
    )

    # No submissions accepted at the moment.
    closed = Tag(
        key="submission:closed",
        name="Closed",
        usage="the leaderboard does not accept submissions at the moment",
        icon="",
    )
class TestSetStatus(Enum):
    """Tags describing the availability of a leaderboard's test sets.

    Every member's value is a ``Tag`` whose key starts with ``test:``.
    """

    # All test sets are public.
    public = Tag(
        key="test:public",
        name="Public",
        usage="all the test sets used are public, the evaluations are completely reproducible",
        icon="",
    )

    # Part public, part private.
    mix = Tag(
        key="test:mix",
        name="Mix",
        usage="some test sets are public and some private",
        icon="",
    )

    # All test sets are private.
    private = Tag(
        key="test:private",
        name="Private",
        usage="all the test sets used are private, the evaluations are hard to game",
        icon="",
    )

    # Test sets change regularly over time.
    rolling = Tag(
        key="test:rolling",
        name="Rolling",
        usage="the test sets used change regularly through time and evaluation scores are refreshed",
        icon="",
    )
class Judge(Enum):
    """Tags describing who or what scores the evaluated answers.

    Every member's value is a ``Tag`` whose key starts with ``judge:``.
    """

    # Scored by an automatic evaluation suite.
    auto = Tag(
        key="judge:auto",
        name="Automatic metric",
        usage="evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`",
        icon="",
    )

    # Scored by a model acting as judge.
    model = Tag(
        key="judge:model",
        name="Model",
        usage="evaluations are run using a model as a judge approach to rate answer",
        icon="",
    )

    # Scored by humans (arena).
    humans = Tag(
        key="judge:humans",
        name="Human",
        usage="evaluations are done by humans to rate answer - this is an arena",
        icon="",
    )

    # Scored manually by one or several humans.
    vibe_check = Tag(
        key="judge:vibe_check",
        name="Vibe check",
        usage="evaluations are done manually by one or several humans",
        icon="",
    )
class Modality(Enum):
    """Tags describing the modality a leaderboard is concerned with.

    Every member's value is a ``Tag`` whose key starts with ``modality:``.
    The text, image, audio and video tags carry an empty usage string.
    """

    text = Tag(
        key="modality:text",
        name="Text",
        usage="",
        icon="",
    )

    image = Tag(
        key="modality:image",
        name="Image",
        usage="",
        icon="",
    )

    audio = Tag(
        key="modality:audio",
        name="Audio",
        usage="",
        icon="",
    )

    video = Tag(
        key="modality:video",
        name="Video",
        usage="",
        icon="",
    )

    # Tool usage; described by its own usage text as outside the usual modalities.
    tools = Tag(
        key="modality:tools",
        name="Tools",
        usage="requires added tool usage - mostly for assistant models (a bit outside of usual modalities)",
        icon="",
    )

    # Machine learning artefacts evaluated as themselves (e.g. text embeddings).
    artefacts = Tag(
        key="modality:artefacts",
        name="Artefacts",
        usage="the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings (a bit outside of usual modalities)",
        icon="",
    )
class EvaluationCategory(Enum):
    """Tags describing what a leaderboard's evaluation focuses on.

    Every member's value is a ``Tag`` whose key starts with ``eval:``.
    """

    # Generation capabilities (image, text, ...).
    generation = Tag(
        key="eval:generation",
        name="Generation",
        usage="the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...) ",
        icon="",
    )

    # Math abilities.
    math = Tag(
        key="eval:math",
        name="Math",
        usage="the evaluation tests math abilities",
        icon="",
    )

    # Coding capabilities.
    code = Tag(
        key="eval:code",
        name="Code",
        usage="the evaluation tests coding capabilities",
        icon="",
    )

    # Speed, energy consumption and similar.
    performance = Tag(
        key="eval:performance",
        name="Performance",
        usage="model performance (speed, energy consumption, ...)",
        icon="",
    )

    # Safety, toxicity, bias.
    safety = Tag(
        key="eval:safety",
        name="Safety",
        usage="the evaluation considers safety, toxicity, bias",
        icon="",
    )