Clémentine committed on
Commit
84b5dfa
1 Parent(s): 4b2522c
app.py CHANGED
@@ -1,31 +1,102 @@
  import gradio as gr
  from apscheduler.schedulers.background import BackgroundScheduler
  from src.static.env import API, REPO_ID, HF_TOKEN
- from src.static.about import TITLE, INTRO, ABOUT
+ from src.static.about import TITLE, INTRO, ABOUT, DOCUMENTATION

  from src.leaderboards.get_from_hub import get_leaderboard_info
+ from src.static.tag_info import *
+ from src.static.display import make_clickable


  def restart_space():
      API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)

- leaderboards_to_info, info_to_leaderboards = get_leaderboard_info()
+ LEADERBOARDS_TO_INFO, INFO_TO_LEADERBOARDS = get_leaderboard_info()
+
+ def update_leaderboards(show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags):
+     spaces_of_interest = []
+     if show_all:
+         spaces_of_interest = INFO_TO_LEADERBOARDS["all"]
+     else:
+         for tag in modality_tags:
+             spaces_of_interest.extend(INFO_TO_LEADERBOARDS["modality"][tag.lower()])
+         for tag in submission_tags:
+             spaces_of_interest.extend(INFO_TO_LEADERBOARDS["submission"][tag.lower()])
+         for tag in test_set_tags:
+             spaces_of_interest.extend(INFO_TO_LEADERBOARDS["test"][tag.lower()])
+         for tag in evaluation_tags:
+             spaces_of_interest.extend(INFO_TO_LEADERBOARDS["eval"][tag.lower()])
+         for tag in language_tags:
+             spaces_of_interest.extend(INFO_TO_LEADERBOARDS["language"][tag.lower()])
+
+     return "- " + "\n - ".join([
+         make_clickable(space) +
+         f"\n*Tags: {', '.join(LEADERBOARDS_TO_INFO[space])}*"
+         for space in spaces_of_interest
+     ])
+


  demo = gr.Blocks()
  with demo:
-     gr.HTML(TITLE)
+     gr.Markdown(TITLE)
      gr.Markdown(INTRO, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
          with gr.TabItem("Search"):
              gr.Markdown("Let's look for leaderboards relevant for you! Select the categories of your choice")
-
+             with gr.Row():
+                 with gr.Column():
+                     show_all = gr.Checkbox(
+                         value=False,
+                         label="Show all leaderboards"
+                     )

+                     modality_tags = gr.CheckboxGroup(
+                         choices=[tag.name for tag in Modality],
+                         value=[],
+                         label="Modality of choice"
+                     )
+                     submission_tags = gr.CheckboxGroup(
+                         choices=[tag.name for tag in SubmissionType],
+                         value=[],
+                         label="Submission type"
+                     )
+                     test_set_tags = gr.CheckboxGroup(
+                         choices=[tag.name for tag in TestSetStatus],
+                         value=[],
+                         label="Test set status"
+                     )
+                 with gr.Column():
+                     evaluation_tags = gr.CheckboxGroup(
+                         choices=[tag.name for tag in EvaluationCategory],
+                         value=[],
+                         label="Specific evaluation categories"
+                     )
+                     language_tags = gr.CheckboxGroup(
+                         choices=[tag.capitalize() for tag in sorted(list(INFO_TO_LEADERBOARDS["language"].keys()))],
+                         value=[],
+                         label="Specific languages"
+                     )
+             with gr.Row():
+                 leaderboards = gr.Markdown(
+                     value="",
+                 )
+
+             for selector in [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags]:
+                 selector.change(
+                     update_leaderboards,
+                     [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags],
+                     leaderboards,
+                     queue=True,
+                 )

          with gr.TabItem("About"):
              gr.Markdown(ABOUT, elem_classes="markdown-text")

+         with gr.TabItem("Documentation"):
+             gr.Markdown(DOCUMENTATION, elem_classes="markdown-text")
+
  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
  scheduler.start()
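For context, here is a minimal sketch of the data shapes `update_leaderboards` expects from `get_leaderboard_info()`. The two dictionaries below are made-up examples for illustration, not the real index built from the Hub:

# Illustrative only: the real mappings are built by get_leaderboard_info().
LEADERBOARDS_TO_INFO = {
    "gaia-benchmark/leaderboard": ["judge:auto", "modality:text", "submission:automatic", "test:private"],
    "mteb/leaderboard": ["modality:artefacts", "modality:text", "submission:semiautomatic"],
}
INFO_TO_LEADERBOARDS = {
    "all": sorted(LEADERBOARDS_TO_INFO),
    "modality": {"text": ["gaia-benchmark/leaderboard", "mteb/leaderboard"], "artefacts": ["mteb/leaderboard"]},
    "submission": {"automatic": ["gaia-benchmark/leaderboard"], "semiautomatic": ["mteb/leaderboard"]},
}

# Ticking the "text" modality checkbox resolves to:
print(INFO_TO_LEADERBOARDS["modality"]["text"])
# ['gaia-benchmark/leaderboard', 'mteb/leaderboard']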
requirements.txt CHANGED
@@ -1 +1,2 @@
- huggingface_hub
+ huggingface_hub
+ apscheduler
src/leaderboards/get_from_hub.py CHANGED
@@ -45,22 +45,33 @@ def get_leaderboard_info() -> tuple[list, dict]:
      saved_leaderboards = [(k, v) for k, v in leaderboard_to_tags.items()]

      seen_leaderboards = []
-     leaderboard_df = []
+     leaderboard_to_info = defaultdict(list)
      info_to_leaderboard = defaultdict(lambda: defaultdict(list))
      for name, tags in leaderboards + arenas + saved_leaderboards:
+         # If a leaderboard appears both on the hub (leaderboards, arenas)
+         # and in the ones we saved manually, we use the version from the hub
          if name in seen_leaderboards:
              continue

          seen_leaderboards.append(name)

+         # If the leaderboard has its own tags in addition to the ones we saved, we aggregate them
          if name in leaderboard_to_tags:
              tags += leaderboard_to_tags[name]

          grouped_tags = group_all_tags(tags)
-         current_info = grouped_tags
-         current_info["name"] = name
-         leaderboard_df.append(current_info)
          for category, tags in grouped_tags.items():
              for tag in tags:
                  info_to_leaderboard[category][tag].append(name)
-     return leaderboard_df, info_to_leaderboard
+                 leaderboard_to_info[name].append(f"{category}:{tag}")
+
+     # Deduplicate everything via sets, then sort
+     for leaderboard, tags in leaderboard_to_info.items():
+         leaderboard_to_info[leaderboard] = sorted(list(set(tags)))
+
+     for category, category_dict in info_to_leaderboard.items():
+         for tag, space_list in category_dict.items():
+             info_to_leaderboard[category][tag] = sorted(list(set(space_list)))
+
+     info_to_leaderboard["all"] = sorted(list(set(seen_leaderboards)))
+     return leaderboard_to_info, info_to_leaderboard
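`group_all_tags` is not part of this hunk; judging from how its output is consumed above, it plausibly splits `category:value` tag strings into a per-category dict. A hypothetical sketch (name and behaviour assumed, not taken from the repo):

from collections import defaultdict

def group_all_tags_sketch(tags: list[str]) -> dict[str, list[str]]:
    # Split "category:value" strings and bucket the values by category.
    grouped = defaultdict(list)
    for tag in tags:
        if ":" in tag:
            category, value = tag.split(":", 1)
            grouped[category].append(value.lower())
    return dict(grouped)

print(group_all_tags_sketch(["submission:automatic", "modality:text", "language:Chinese"]))
# {'submission': ['automatic'], 'modality': ['text'], 'language': ['chinese']}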
src/leaderboards/saved.py CHANGED
@@ -11,7 +11,7 @@ leaderboard_to_tags = {
      "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
      "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:text", "modality:image", "modality:video"],
      "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
-     "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", ],
+     "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
      "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
      "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
      "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
src/static/about.py CHANGED
@@ -1,10 +1,12 @@
+ from src.static.tag_info import *
+
  TITLE = "# Leaderboard explorer"

  INTRO = """
  Have you ever wondered which leaderboard would be best for your use case?
  """

- ABOUT = """
+ ABOUT = ("""
  If you want your leaderboard to appear in our suggestions, feel free to add relevant information in its tag metadata, and it will be displayed here.

  # First step
@@ -21,44 +23,45 @@ tags:
  ## Submission type
  Arenas are not concerned by this category.

- - `submission:automatic`: users can submit their models directly to the leaderboard, and evaluation is run automatically without human intervention
- - `submission:semiautomatic`: the leaderboard requires the model owner to run evaluations on their side and submit the results
- - `submission:manual`: the leaderboard requires the leaderboard owner to run evaluations for new submissions
- - `submission:closed`: the leaderboard does not accept submissions at the moment
-
+ """ +
+ "\n".join([f"- {s.value.key}: {s.value.usage}" for s in SubmissionType]) +
+ """
  ## Test set status
  Arenas are not concerned by this category.

- - `test:public`: all the test sets used are public, so the evaluations are completely reproducible
- - `test:mix`: some test sets are public and some are private
- - `test:private`: all the test sets used are private, so the evaluations are hard to game
- - `test:rolling`: the test sets change regularly through time and evaluation scores are refreshed
+ """ +
+ "\n".join([f"- {s.value.key}: {s.value.usage}" for s in TestSetStatus]) +
+ """

  ## Judges
- - `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
- - `judge:model`: evaluations are run using a model-as-a-judge approach to rate answers
- - `judge:humans`: evaluations are done by humans to rate answers - this is an arena
- - `judge:vibe_check`: evaluations are done manually by one human
+
+ """ +
+ "\n".join([f"- {s.value.key}: {s.value.usage}" for s in Judge]) +
+ """

  ## Modalities
  Can be any (or several) of the following list:
- - `modality:text`
- - `modality:image`
- - `modality:video`
- - `modality:audio`
- A bit outside of usual modalities
- - `modality:tools`: requires added tool usage - mostly for assistant models
- - `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts themselves, for example, quality evaluation of text embeddings.
+
+ """ +
+ "\n".join([f"- {s.value.key}: {s.value.usage}" for s in Modality]) +
+ """

  ## Evaluation categories
  Can be any (or several) of the following list:
- - `eval:generation`: the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...)
- - `eval:math`
- - `eval:code`
- - `eval:performance`: model performance (speed, energy consumption, ...)
- - `eval:safety`: safety, toxicity, bias evaluations
+
+ """ +
+ "\n".join([f"- {s.value.key}: {s.value.usage}" for s in EvaluationCategory]) +
+ """

  ## Language
  You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
  At the moment, we do not support language codes, please use the language name in English.
+ """)
+
+ DOCUMENTATION = """
+ How to create your own leaderboard?
+
+ I'll make an updated documentation page here at some point, but for now, you can check our [demo leaderboard org](https://huggingface.co/demo-leaderboard-backend)!
+
+ You just need to duplicate the front space (and the backend if you want to run your leaderboard on Spaces compute), copy the datasets to your own org, and edit the env variables.
  """
src/static/display.py ADDED
@@ -0,0 +1,22 @@
+ def space_html_block(space_info) -> str:
+     url = space_info.url
+
+     return f"""
+     <article class="">
+         <a href="{url}" class="relative z-0 mx-auto flex flex-col items-center justify-center bg-gradient-to-br p-4 filter from-blue-600 to-blue-600 overflow-hidden hover:brightness-110 h-40 rounded-lg">
+             <div class="absolute left-0 top-0 h-24 w-1/2 bg-gradient-to-br from-black/20 via-transparent to-transparent"></div>
+             <div class="absolute flex items-center rounded-xl top-2.5 right-4 text-sm"><svg class="mr-1.5 text-white" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg>
+             <span class="text-white">22</span></div>
+         <div class="absolute opacity-60 text-6xl mb-1 drop-shadow-xl">{icons}</div>
+         <h4 class="z-40 max-w-full truncate text-center font-bold leading-tight text-blue-50 text-xl " style="text-shadow: 0px 1px 2px rgba(0, 0, 0, 0.25);">{name}</h4>
+     </a>
+     """
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def make_clickable(space):
+     link = f"https://huggingface.co/{space}"
+
+     return model_hyperlink(link, space)
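A quick usage example of the helper the Search tab relies on (assuming src/ is importable):

from src.static.display import make_clickable

print(make_clickable("gaia-benchmark/leaderboard"))
# <a target="_blank" href="https://huggingface.co/gaia-benchmark/leaderboard" ...>gaia-benchmark/leaderboard</a>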
src/static/tag_info.py ADDED
@@ -0,0 +1,157 @@
+ from enum import Enum
+ from dataclasses import dataclass
+
+ @dataclass
+ class Tag:
+     key: str
+     name: str  # for display
+     usage: str  # explains usage
+     icon: str
+
+ class SubmissionType(Enum):
+     automatic = Tag(
+         key="submission:automatic",
+         name="Automatic",
+         usage="users can submit their models directly to the leaderboard, and evaluation is run automatically without human intervention",
+         icon=""
+     )
+     semiautomatic = Tag(
+         key="submission:semiautomatic",
+         name="Semi Automatic",
+         usage="the leaderboard requires the model owner to run evaluations on their side and submit the results",
+         icon=""
+     )
+     manual = Tag(
+         key="submission:manual",
+         name="Manual",
+         usage="the leaderboard requires the leaderboard owner to run evaluations for new submissions",
+         icon=""
+     )
+     closed = Tag(
+         key="submission:closed",
+         name="Closed",
+         usage="the leaderboard does not accept submissions at the moment",
+         icon=""
+     )
+
+ class TestSetStatus(Enum):
+     public = Tag(
+         key="test:public",
+         name="Public",
+         usage="all the test sets used are public, so the evaluations are completely reproducible",
+         icon=""
+     )
+     mix = Tag(
+         key="test:mix",
+         name="Mix",
+         usage="some test sets are public and some are private",
+         icon=""
+     )
+     private = Tag(
+         key="test:private",
+         name="Private",
+         usage="all the test sets used are private, so the evaluations are hard to game",
+         icon=""
+     )
+     rolling = Tag(
+         key="test:rolling",
+         name="Rolling",
+         usage="the test sets change regularly through time and evaluation scores are refreshed",
+         icon=""
+     )
+
+ class Judge(Enum):
+     auto = Tag(
+         key="judge:auto",
+         name="Automatic metric",
+         usage="evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`",
+         icon=""
+     )
+     model = Tag(
+         key="judge:model",
+         name="Model",
+         usage="evaluations are run using a model-as-a-judge approach to rate answers",
+         icon=""
+     )
+     humans = Tag(
+         key="judge:humans",
+         name="Human",
+         usage="evaluations are done by humans to rate answers - this is an arena",
+         icon=""
+     )
+     vibe_check = Tag(
+         key="judge:vibe_check",
+         name="Vibe check",
+         usage="evaluations are done manually by one or several humans",
+         icon=""
+     )
+
+ class Modality(Enum):
+     text = Tag(
+         key="modality:text",
+         name="Text",
+         usage="",
+         icon=""
+     )
+     image = Tag(
+         key="modality:image",
+         name="Image",
+         usage="",
+         icon=""
+     )
+     audio = Tag(
+         key="modality:audio",
+         name="Audio",
+         usage="",
+         icon=""
+     )
+     video = Tag(
+         key="modality:video",
+         name="Video",
+         usage="",
+         icon=""
+     )
+     tools = Tag(
+         key="modality:tools",
+         name="Tools",
+         usage="requires added tool usage - mostly for assistant models (a bit outside of usual modalities)",
+         icon=""
+     )
+     artefacts = Tag(
+         key="modality:artefacts",
+         name="Artefacts",
+         usage="the leaderboard is about machine learning artefacts themselves, for example quality evaluation of text embeddings (a bit outside of usual modalities)",
+         icon=""
+     )
+
+ class EvaluationCategory(Enum):
+     generation = Tag(
+         key="eval:generation",
+         name="Generation",
+         usage="the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...)",
+         icon=""
+     )
+     math = Tag(
+         key="eval:math",
+         name="Math",
+         usage="the evaluation tests math abilities",
+         icon=""
+     )
+     code = Tag(
+         key="eval:code",
+         name="Code",
+         usage="the evaluation tests coding capabilities",
+         icon=""
+     )
+     performance = Tag(
+         key="eval:performance",
+         name="Performance",
+         usage="the evaluation measures model performance (speed, energy consumption, ...)",
+         icon=""
+     )
+     safety = Tag(
+         key="eval:safety",
+         name="Safety",
+         usage="the evaluation considers safety, toxicity, and bias",
+         icon=""
+     )