Spaces:
Sleeping
Sleeping
File size: 4,349 Bytes
84b5dfa 64d2c90 84b5dfa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
from enum import Enum
from dataclasses import dataclass
@dataclass
class Tag:
    """Descriptor stored as the value of each member of the tag enums.

    Attributes:
        key: Identifier string; the enums in this file use a
            ``"<category>:<value>"`` form such as ``"submission:automatic"``.
        name: Human-readable label, intended for display.
        usage: Text explaining when the tag applies (empty for some tags).
        icon: Icon string; every tag defined in this file passes ``""``.
    """

    key: str
    name: str
    usage: str
    icon: str
class SubmissionType(Enum):
    """Tags describing how a leaderboard takes in new submissions.

    Each member's ``value`` is a ``Tag`` whose key starts with ``submission:``.
    """

    # Evaluation runs without any human in the loop.
    automatic = Tag(
        key="submission:automatic",
        name="Automatic",
        usage=(
            "users can submit their models as such to the leaderboard, "
            "and evaluation is run automatically without human intervention"
        ),
        icon="",
    )

    # The model owner evaluates locally and reports the results.
    semiautomatic = Tag(
        key="submission:semiautomatic",
        name="Semi Automatic",
        usage=(
            "the leaderboard requires the model owner to run evaluations "
            "on his side and submit the results"
        ),
        icon="",
    )

    # The leaderboard owner evaluates every new submission.
    manual = Tag(
        key="submission:manual",
        name="Manual",
        usage=(
            "the leaderboard requires the leaderboard owner to run evaluations "
            "for new submissions"
        ),
        icon="",
    )

    # No submissions are currently accepted.
    closed = Tag(
        key="submission:closed",
        name="Closed",
        usage="the leaderboard does not accept submissions at the moment",
        icon="",
    )
class TestSetStatus(Enum):
    """Tags describing the availability of a leaderboard's test sets.

    Each member's ``value`` is a ``Tag`` whose key starts with ``test:``.
    """

    # Every test set is public.
    public = Tag(
        key="test:public",
        name="Public",
        usage=(
            "all the test sets used are public, "
            "the evaluations are completely reproducible"
        ),
        icon="",
    )

    # Some test sets are public, others are private.
    mix = Tag(
        key="test:mix",
        name="Mix",
        usage="some test sets are public and some private",
        icon="",
    )

    # Every test set is private.
    private = Tag(
        key="test:private",
        name="Private",
        usage=(
            "all the test sets used are private, "
            "the evaluations are hard to game"
        ),
        icon="",
    )

    # Test sets are replaced over time and scores are refreshed.
    rolling = Tag(
        key="test:rolling",
        name="Rolling",
        usage=(
            "the test sets used change regularly through time "
            "and evaluation scores are refreshed"
        ),
        icon="",
    )
class Judge(Enum):
    """Tags describing who or what scores the evaluated answers.

    Each member's ``value`` is a ``Tag`` whose key starts with ``judge:``.
    """

    # Scored by an automatic evaluation suite.
    auto = Tag(
        key="judge:auto",
        name="Automatic metric",
        usage=(
            "evaluations are run automatically, "
            "using an evaluation suite such as `lm_eval` or `lighteval`"
        ),
        icon="",
    )

    # Scored by a model acting as judge.
    model = Tag(
        key="judge:model",
        name="Model",
        usage="evaluations are run using a model as a judge approach to rate answer",
        icon="",
    )

    # Scored by human raters (arena style).
    humans = Tag(
        key="judge:humans",
        name="Human",
        usage="evaluations are done by humans to rate answer - this is an arena",
        icon="",
    )

    # Scored manually by one or a few people.
    vibe_check = Tag(
        key="judge:vibe_check",
        name="Vibe check",
        usage="evaluations are done manually by one or several humans",
        icon="",
    )
class Modality(Enum):
    """Tags describing the modality a leaderboard is concerned with.

    Each member's ``value`` is a ``Tag`` whose key starts with ``modality:``.
    The four media modalities carry an empty ``usage`` string.
    """

    text = Tag(key="modality:text", name="Text", usage="", icon="")

    image = Tag(key="modality:image", name="Image", usage="", icon="")

    audio = Tag(key="modality:audio", name="Audio", usage="", icon="")

    video = Tag(key="modality:video", name="Video", usage="", icon="")

    # Tool usage, mostly for assistant models.
    tools = Tag(
        key="modality:tools",
        name="Tools",
        usage=(
            "requires added tool usage - mostly for assistant models "
            "(a bit outside of usual modalities)"
        ),
        icon="",
    )

    # Machine learning artefacts evaluated for themselves (e.g. embeddings).
    artefacts = Tag(
        key="modality:artefacts",
        name="Artefacts",
        usage=(
            "the leaderboard concerns itself with machine learning artefacts "
            "as themselves, for example, quality evaluation of text embeddings "
            "(a bit outside of usual modalities)"
        ),
        icon="",
    )
class EvaluationCategory(Enum):
    """Tags describing which capability an evaluation targets.

    Each member's ``value`` is a ``Tag`` whose key starts with ``eval:``.
    """

    # Generation capabilities (image, text, ...).
    generation = Tag(
        key="eval:generation",
        name="Generation",
        usage=(
            "the evaluation looks at generation capabilities specifically "
            # The trailing space below is part of the runtime string; keep it.
            "(can be image generation, text generation, ...) "
        ),
        icon="",
    )

    # Math abilities.
    math = Tag(
        key="eval:math",
        name="Math",
        usage="the evaluation tests math abilities",
        icon="",
    )

    # Coding capabilities.
    code = Tag(
        key="eval:code",
        name="Code",
        usage="the evaluation tests coding capabilities",
        icon="",
    )

    # Speed, energy consumption and similar performance measures.
    performance = Tag(
        key="eval:performance",
        name="Performance",
        usage="model performance (speed, energy consumption, ...)",
        icon="",
    )

    # Safety, toxicity and bias.
    safety = Tag(
        key="eval:safety",
        name="Safety",
        usage="the evaluation considers safety, toxicity, bias",
        icon="",
    )