File size: 4,349 Bytes
84b5dfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64d2c90
84b5dfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from enum import Enum
from dataclasses import dataclass

@dataclass
class Tag:
    """A single tag, stored as four plain strings.

    Attributes:
        key: Identifier of the tag, e.g. ``"submission:automatic"``.
        name: Label used for display.
        usage: Text explaining when the tag is used.
        icon: Icon string for the tag (may be empty).
    """

    key: str
    name: str
    usage: str
    icon: str

class SubmissionType(Enum):
    """Tags describing how new submissions are handled by a leaderboard.

    Each member's value is a ``Tag`` whose key starts with ``submission:``.
    """

    # Evaluation runs without human intervention.
    automatic = Tag(
        key="submission:automatic", name="Automatic", icon="",
        usage=(
            "users can submit their models as such to the leaderboard, "
            "and evaluation is run automatically without human intervention"
        ),
    )
    # The model owner runs the evaluation and submits the results.
    semiautomatic = Tag(
        key="submission:semiautomatic", name="Semi Automatic", icon="",
        usage=(
            "the leaderboard requires the model owner to run evaluations "
            "on his side and submit the results"
        ),
    )
    # The leaderboard owner runs the evaluation.
    manual = Tag(
        key="submission:manual", name="Manual", icon="",
        usage=(
            "the leaderboard requires the leaderboard owner to run evaluations "
            "for new submissions"
        ),
    )
    # No submissions accepted for now.
    closed = Tag(
        key="submission:closed", name="Closed", icon="",
        usage="the leaderboard does not accept submissions at the moment",
    )

class TestSetStatus(Enum):
    """Tags describing whether the test sets behind a leaderboard are public.

    Each member's value is a ``Tag`` whose key starts with ``test:``.
    """

    public = Tag(
        key="test:public", name="Public", icon="",
        usage=(
            "all the test sets used are public, "
            "the evaluations are completely reproducible"
        ),
    )
    mix = Tag(
        key="test:mix", name="Mix", icon="",
        usage="some test sets are public and some private",
    )
    private = Tag(
        key="test:private", name="Private", icon="",
        usage=(
            "all the test sets used are private, "
            "the evaluations are hard to game"
        ),
    )
    rolling = Tag(
        key="test:rolling", name="Rolling", icon="",
        usage=(
            "the test sets used change regularly through time "
            "and evaluation scores are refreshed"
        ),
    )

class Judge(Enum):
    """Tags describing who or what rates the answers in an evaluation.

    Each member's value is a ``Tag`` whose key starts with ``judge:``.
    """

    # Scored by an automatic evaluation suite.
    auto = Tag(
        key="judge:auto", name="Automatic metric", icon="",
        usage=(
            "evaluations are run automatically, "
            "using an evaluation suite such as `lm_eval` or `lighteval`"
        ),
    )
    # Scored by a model acting as judge.
    model = Tag(
        key="judge:model", name="Model", icon="",
        usage="evaluations are run using a model as a judge approach to rate answer",
    )
    # Scored by humans (arena).
    humans = Tag(
        key="judge:humans", name="Human", icon="",
        usage="evaluations are done by humans to rate answer - this is an arena",
    )
    # Scored manually by one or several humans.
    vibe_check = Tag(
        key="judge:vibe_check", name="Vibe check", icon="",
        usage="evaluations are done manually by one or several humans",
    )

class Modality(Enum):
    """Modality tags.

    Each member's value is a ``Tag`` whose key starts with ``modality:``.
    The four classic modalities carry an empty usage text; ``tools`` and
    ``artefacts`` explain themselves because they sit a bit outside of the
    usual modalities.
    """

    # Classic modalities: no usage text.
    text = Tag(key="modality:text", name="Text", usage="", icon="")
    image = Tag(key="modality:image", name="Image", usage="", icon="")
    audio = Tag(key="modality:audio", name="Audio", usage="", icon="")
    video = Tag(key="modality:video", name="Video", usage="", icon="")

    # Entries that are a bit outside of the usual modalities.
    tools = Tag(
        key="modality:tools", name="Tools", icon="",
        usage=(
            "requires added tool usage - mostly for assistant models "
            "(a bit outside of usual modalities)"
        ),
    )
    artefacts = Tag(
        key="modality:artefacts", name="Artefacts", icon="",
        usage=(
            "the leaderboard concerns itself with machine learning artefacts as themselves, "
            "for example, quality evaluation of text embeddings "
            "(a bit outside of usual modalities)"
        ),
    )

class EvaluationCategory(Enum):
    """Tags describing what an evaluation looks at.

    Each member's value is a ``Tag`` whose key starts with ``eval:``.
    """

    generation = Tag(
        key="eval:generation", name="Generation", icon="",
        # The trailing space at the end of this text is kept verbatim.
        usage=(
            "the evaluation looks at generation capabilities specifically "
            "(can be image generation, text generation, ...) "
        ),
    )
    math = Tag(
        key="eval:math", name="Math", icon="",
        usage="the evaluation tests math abilities",
    )
    code = Tag(
        key="eval:code", name="Code", icon="",
        usage="the evaluation tests coding capabilities",
    )
    performance = Tag(
        key="eval:performance", name="Performance", icon="",
        usage="model performance (speed, energy consumption, ...)",
    )
    safety = Tag(
        key="eval:safety", name="Safety", icon="",
        usage="the evaluation considers safety, toxicity, bias",
    )