# Spaces: Running on CPU Upgrade
from dataclasses import dataclass
from enum import Enum
@dataclass
class Tag:
    """A display tag attached to a leaderboard, grouped into categories by the enums below.

    Each enum member (SubmissionType, TestSetStatus, Judge, Modality,
    EvaluationCategory) wraps one Tag instance. The @dataclass decorator is
    required: members are constructed with keyword arguments
    (e.g. Tag(key=..., name=..., usage=..., icon=...)).
    """
    key: str    # machine-readable identifier, namespaced as "<category>:<value>"
    name: str   # human-readable label, for display
    usage: str  # explains when this tag applies
    icon: str   # optional icon/emoji shown next to the name (may be empty)
class SubmissionType(Enum):
    """How new models get onto the leaderboard (keys namespaced "submission:*")."""
    automatic = Tag(
        key="submission:automatic",
        name="Automatic",
        usage="users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention",
        icon=""
    )
    semiautomatic = Tag(
        key="submission:semiautomatic",
        name="Semi Automatic",
        usage="the leaderboard requires the model owner to run evaluations on his side and submit the results",
        icon=""
    )
    manual = Tag(
        key="submission:manual",
        name="Manual",
        usage="the leaderboard requires the leaderboard owner to run evaluations for new submissions",
        icon=""
    )
    closed = Tag(
        key="submission:closed",
        name="Closed",
        usage="the leaderboard does not accept submissions at the moment",
        icon=""
    )
class TestSetStatus(Enum):
    """Visibility of the evaluation test sets (keys namespaced "test:*")."""
    public = Tag(
        key="test:public",
        name="Public",
        usage="all the test sets used are public, the evaluations are completely reproducible",
        icon=""
    )
    mix = Tag(
        key="test:mix",
        name="Mix",
        usage="some test sets are public and some private",
        icon=""
    )
    private = Tag(
        key="test:private",
        name="Private",
        usage="all the test sets used are private, the evaluations are hard to game",
        icon=""
    )
    rolling = Tag(
        key="test:rolling",
        name="Rolling",
        usage="the test sets used change regularly through time and evaluation scores are refreshed",
        icon=""
    )
class Judge(Enum):
    """Who or what scores the model outputs (keys namespaced "judge:*")."""
    auto = Tag(
        key="judge:auto",
        name="Automatic metric",
        usage="evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`",
        icon=""
    )
    model = Tag(
        key="judge:model",
        name="Model",
        usage="evaluations are run using a model as a judge approach to rate answer",
        icon=""
    )
    humans = Tag(
        key="judge:humans",
        name="Human",
        usage="evaluations are done by humans to rate answer - this is an arena",
        icon=""
    )
    vibe_check = Tag(
        key="judge:vibe_check",
        name="Vibe check",
        usage="evaluations are done manually by one or several humans",
        icon=""
    )
class Modality(Enum):
    """Input/output modality covered by the leaderboard (keys namespaced "modality:*").

    `tools` and `artefacts` are noted in their usage strings as being a bit
    outside the usual modalities.
    """
    text = Tag(
        key="modality:text",
        name="Text",
        usage="",
        icon=""
    )
    image = Tag(
        key="modality:image",
        name="Image",
        usage="",
        icon=""
    )
    audio = Tag(
        key="modality:audio",
        name="Audio",
        usage="",
        icon=""
    )
    video = Tag(
        key="modality:video",
        name="Video",
        usage="",
        icon=""
    )
    tools = Tag(
        key="modality:tools",
        name="Tools",
        usage="requires added tool usage - mostly for assistant models (a bit outside of usual modalities)",
        icon=""
    )
    artefacts = Tag(
        key="modality:artefacts",
        name="Artefacts",
        usage="the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings (a bit outside of usual modalities)",
        icon=""
    )
class EvaluationCategory(Enum):
    """Capability area the leaderboard evaluates (keys namespaced "eval:*")."""
    generation = Tag(
        key="eval:generation",
        name="Generation",
        usage="the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...) ",
        icon=""
    )
    math = Tag(
        key="eval:math",
        name="Math",
        usage="the evaluation tests math abilities",
        icon=""
    )
    code = Tag(
        key="eval:code",
        name="Code",
        usage="the evaluation tests coding capabilities",
        icon=""
    )
    performance = Tag(
        key="eval:performance",
        name="Performance",
        usage="model performance (speed, energy consumption, ...)",
        icon=""
    )
    safety = Tag(
        key="eval:safety",
        name="Safety",
        usage="the evaluation considers safety, toxicity, bias",
        icon=""
    )