Clémentine committed on
Commit
beb2b32
1 Parent(s): aa85eec
README.md CHANGED
@@ -1,15 +1,17 @@
1
  ---
2
- title: Open LLM Leaderboard
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.20.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
11
  fullWidth: true
12
  startup_duration_timeout: 1h
 
13
  space_ci:
14
  private: true
15
  secrets:
 
1
  ---
2
+ title: Open LLM Leaderboard 2
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
+ duplicated_from: open-llm-leaderboard/open_llm_leaderboard
12
  fullWidth: true
13
  startup_duration_timeout: 1h
14
+ hf_oauth: true
15
  space_ci:
16
  private: true
17
  secrets:
app.py CHANGED
@@ -1,12 +1,17 @@
1
  import os
2
  import logging
3
  import time
 
4
  import datetime
5
  import gradio as gr
 
6
  import datasets
7
  from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
8
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
9
 
 
 
 
10
  from src.display.about import (
11
  CITATION_BUTTON_LABEL,
12
  CITATION_BUTTON_TEXT,
@@ -27,6 +32,7 @@ from src.display.utils import (
27
  Precision,
28
  WeightType,
29
  fields,
 
30
  )
31
  from src.envs import (
32
  API,
@@ -35,35 +41,343 @@ from src.envs import (
35
  HF_TOKEN,
36
  QUEUE_REPO,
37
  REPO_ID,
 
 
38
  HF_HOME,
39
  )
 
 
 
 
40
 
41
- demo = gr.Blocks(css=custom_css)
42
- with demo:
43
- gr.HTML(TITLE)
44
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
45
 
46
- countdown = gr.HTML(
47
- """<div align="center">
48
- <div position: relative>
49
- <img
50
- src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
51
- allowtransparency="true"
52
- style="display:block;width:100%;height:auto;"
53
- />
54
- <iframe
55
- src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&amp;timezone=Europe%2FParis&amp;width=&amp;style=circles&amp;uid=815898&amp;loc=https://logwork.com/countdown-fxmc&amp;language=en&amp;textcolor=&amp;background=%23ffd21e&amp;date=2024-06-26%2015%3A00%3A00&amp;digitscolor=%23ff9d00&amp;unitscolor=&amp"
56
- style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
57
- scrolling="no"
58
- allowtransparency="true"
59
- frameborder="0"
60
- allowfullscreen
61
- />
62
- </div>
63
- </div>"""
64
  )
65
- #gif = gr.Image(value="./gif.gif", interactive=False)
66
- gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with gr.Row():
69
  with gr.Accordion("📙 Citation", open=False):
@@ -75,4 +389,69 @@ with demo:
75
  show_copy_button=True,
76
  )
77
 
78
- demo.queue(default_concurrency_limit=40).launch()
1
  import os
2
  import logging
3
  import time
4
+ import schedule
5
  import datetime
6
  import gradio as gr
7
+ from threading import Thread
8
  import datasets
9
  from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
10
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
 
12
+ # Start ephemeral Spaces on PRs (see config in README.md)
13
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
14
+
15
  from src.display.about import (
16
  CITATION_BUTTON_LABEL,
17
  CITATION_BUTTON_TEXT,
 
32
  Precision,
33
  WeightType,
34
  fields,
35
+ EvalQueueColumn
36
  )
37
  from src.envs import (
38
  API,
 
41
  HF_TOKEN,
42
  QUEUE_REPO,
43
  REPO_ID,
44
+ VOTES_REPO,
45
+ VOTES_PATH,
46
  HF_HOME,
47
  )
48
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
49
+ from src.submission.submit import add_new_eval
50
+ from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
51
+ from src.voting.vote_system import VoteManager, run_scheduler
52
 
53
+ # Configure logging
54
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
55
+
56
+ # Start ephemeral Spaces on PRs (see config in README.md)
57
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
58
+
59
+ # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
60
+ # This controls whether a full initialization should be performed.
61
+ DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
62
+ LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
63
+ LEADERBOARD_DF = None
64
+
65
+ def restart_space():
66
+ API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
67
+
68
+
69
+ def time_diff_wrapper(func):
70
+ def wrapper(*args, **kwargs):
71
+ start_time = time.time()
72
+ result = func(*args, **kwargs)
73
+ end_time = time.time()
74
+ diff = end_time - start_time
75
+ logging.info(f"Time taken for {func.__name__}: {diff} seconds")
76
+ return result
77
+
78
+ return wrapper
79
 
80
+
81
+ @time_diff_wrapper
82
+ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
83
+ """Download dataset with exponential backoff retries."""
84
+ attempt = 0
85
+ while attempt < max_attempts:
86
+ try:
87
+ logging.info(f"Downloading {repo_id} to {local_dir}")
88
+ snapshot_download(
89
+ repo_id=repo_id,
90
+ local_dir=local_dir,
91
+ repo_type=repo_type,
92
+ tqdm_class=None,
93
+ etag_timeout=30,
94
+ max_workers=8,
95
+ )
96
+ logging.info("Download successful")
97
+ return
98
+ except Exception as e:
99
+ wait_time = backoff_factor**attempt
100
+ logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
101
+ time.sleep(wait_time)
102
+ attempt += 1
103
+ raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
104
+
105
+ def get_latest_data_leaderboard(leaderboard_initial_df = None):
106
+ current_time = datetime.datetime.now()
107
+ global LAST_UPDATE_LEADERBOARD
108
+ if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
109
+ return leaderboard_initial_df
110
+ LAST_UPDATE_LEADERBOARD = current_time
111
+ leaderboard_dataset = datasets.load_dataset(
112
+ AGGREGATED_REPO,
113
+ "default",
114
+ split="train",
115
+ cache_dir=HF_HOME,
116
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
117
+ verification_mode="no_checks"
118
+ )
119
+
120
+ global LEADERBOARD_DF
121
+ LEADERBOARD_DF = get_leaderboard_df(
122
+ leaderboard_dataset=leaderboard_dataset,
123
+ cols=COLS,
124
+ benchmark_cols=BENCHMARK_COLS,
125
  )
126
+
127
+ return LEADERBOARD_DF
128
+
129
+ def get_latest_data_queue():
130
+ eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
131
+ return eval_queue_dfs
132
+
133
+ def init_space():
134
+ """Initializes the application space, loading only necessary data."""
135
+ if DO_FULL_INIT:
136
+ # These downloads only occur on full initialization
137
+ try:
138
+ download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
139
+ download_dataset(VOTES_REPO, VOTES_PATH)
140
+ except Exception:
141
+ restart_space()
142
+
143
+ # Always redownload the leaderboard DataFrame
144
+ global LEADERBOARD_DF
145
+ LEADERBOARD_DF = get_latest_data_leaderboard()
146
+
147
+ # Evaluation queue DataFrame retrieval is independent of initialization detail level
148
+ eval_queue_dfs = get_latest_data_queue()
149
+
150
+ return LEADERBOARD_DF, eval_queue_dfs
151
+
152
+ # Initialize VoteManager
153
+ vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
154
+
155
+
156
+ # Schedule the upload_votes method to run every 15 minutes
157
+ schedule.every(15).minutes.do(vote_manager.upload_votes)
158
+
159
+ # Start the scheduler in a separate thread
160
+ scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
161
+ scheduler_thread.start()
162
+
163
+ # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
164
+ # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
165
+ LEADERBOARD_DF, eval_queue_dfs = init_space()
166
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
167
+
168
+
169
+ # Data processing for plots now only on demand in the respective Gradio tab
170
+ def load_and_create_plots():
171
+ plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
172
+ return plot_df
173
+
174
+ # Function to check if a user is logged in
175
+ def check_login(profile: gr.OAuthProfile | None) -> bool:
176
+ if profile is None:
177
+ return False
178
+ return True
179
+
180
+ def init_leaderboard(dataframe):
181
+ if dataframe is None or dataframe.empty:
182
+ raise ValueError("Leaderboard DataFrame is empty or None.")
183
+ return Leaderboard(
184
+ value=dataframe,
185
+ datatype=[c.type for c in fields(AutoEvalColumn)],
186
+ select_columns=SelectColumns(
187
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
188
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
189
+ label="Select Columns to Display:",
190
+ ),
191
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
192
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
193
+ filter_columns=[
194
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
195
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
196
+ ColumnFilter(
197
+ AutoEvalColumn.params.name,
198
+ type="slider",
199
+ min=0.01,
200
+ max=150,
201
+ label="Select the number of parameters (B)",
202
+ ),
203
+ ColumnFilter(
204
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
205
+ ),
206
+ ColumnFilter(
207
+ AutoEvalColumn.merged.name, type="boolean", label="Merge/MoErge", default=True
208
+ ),
209
+ ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
210
+ ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
211
+ ColumnFilter(AutoEvalColumn.maintainers_highlight.name, type="boolean", label="Show only maintainer's highlight", default=False),
212
+ ],
213
+ bool_checkboxgroup_label="Hide models",
214
+ interactive=False,
215
+ )
216
+
217
+ main_block = gr.Blocks(css=custom_css)
218
+ with main_block:
219
+ with gr.Row(elem_id="header-row"):
220
+ gr.HTML(TITLE)
221
+
222
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
223
+
224
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
225
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
226
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
227
+
228
+ with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
229
+ with gr.Column():
230
+ with gr.Row():
231
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
232
+
233
+ with gr.Row():
234
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
235
+
236
+ with gr.Row():
237
+ with gr.Column():
238
+ model_name_textbox = gr.Textbox(label="Model name")
239
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="latest")
240
+ with gr.Row():
241
+ model_type = gr.Dropdown(
242
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
243
+ label="Model type",
244
+ multiselect=False,
245
+ value=ModelType.FT.to_str(" : "),
246
+ interactive=True,
247
+ )
248
+ chat_template_toggle = gr.Checkbox(
249
+ label="Use chat template",
250
+ value=False,
251
+ info="Is your model a chat model?",
252
+ )
253
+
254
+ with gr.Column():
255
+ precision = gr.Dropdown(
256
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
257
+ label="Precision",
258
+ multiselect=False,
259
+ value="float16",
260
+ interactive=True,
261
+ )
262
+ weight_type = gr.Dropdown(
263
+ choices=[i.value.name for i in WeightType],
264
+ label="Weights type",
265
+ multiselect=False,
266
+ value="Original",
267
+ interactive=True,
268
+ )
269
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
270
+
271
+ with gr.Column():
272
+ with gr.Accordion(
273
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
274
+ open=False,
275
+ ):
276
+ with gr.Row():
277
+ finished_eval_table = gr.components.Dataframe(
278
+ value=finished_eval_queue_df,
279
+ headers=EVAL_COLS,
280
+ datatype=EVAL_TYPES,
281
+ row_count=5,
282
+ )
283
+ with gr.Accordion(
284
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
285
+ open=False,
286
+ ):
287
+ with gr.Row():
288
+ running_eval_table = gr.components.Dataframe(
289
+ value=running_eval_queue_df,
290
+ headers=EVAL_COLS,
291
+ datatype=EVAL_TYPES,
292
+ row_count=5,
293
+ )
294
+
295
+ with gr.Accordion(
296
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
297
+ open=False,
298
+ ):
299
+ with gr.Row():
300
+ pending_eval_table = gr.components.Dataframe(
301
+ value=pending_eval_queue_df,
302
+ headers=EVAL_COLS,
303
+ datatype=EVAL_TYPES,
304
+ row_count=5,
305
+ )
306
+
307
+ submit_button = gr.Button("Submit Eval")
308
+ submission_result = gr.Markdown()
309
+
310
+ # The chat template checkbox update function
311
+ def update_chat_checkbox(model_type_value):
312
+ return ModelType.from_str(model_type_value) == ModelType.chat
313
+
314
+ model_type.change(
315
+ fn=update_chat_checkbox,
316
+ inputs=[model_type], # Pass the current checkbox value
317
+ outputs=chat_template_toggle,
318
+ )
319
+
320
+ submit_button.click(
321
+ add_new_eval,
322
+ [
323
+ model_name_textbox,
324
+ base_model_name_textbox,
325
+ revision_name_textbox,
326
+ precision,
327
+ weight_type,
328
+ model_type,
329
+ chat_template_toggle,
330
+ ],
331
+ submission_result,
332
+ )
333
+
334
+ # Ensure the values in 'pending_eval_queue_df' are correct and ready for the DataFrame component
335
+ with gr.TabItem("🆙 Model Vote"):
336
+ with gr.Row():
337
+ gr.Markdown(
338
+ "## Vote for the models which should be evaluated first! \nYou'll need to sign in with the button above first. All votes are recorded.",
339
+ elem_classes="markdown-text"
340
+ )
341
+ login_button = gr.LoginButton(elem_id="oauth-button")
342
+
343
+
344
+ with gr.Row():
345
+ pending_models = pending_eval_queue_df[EvalQueueColumn.model_name.name].to_list()
346
+
347
+ with gr.Column():
348
+ selected_model = gr.Dropdown(
349
+ choices=pending_models,
350
+ label="Models",
351
+ multiselect=False,
352
+ value="str",
353
+ interactive=True,
354
+ )
355
+
356
+ vote_button = gr.Button("Vote", variant="primary")
357
+
358
+ with gr.Row():
359
+ with gr.Accordion(
360
+ f"Available models pending ({len(pending_eval_queue_df)})",
361
+ open=True,
362
+ ):
363
+ with gr.Row():
364
+ pending_eval_table_votes = gr.components.Dataframe(
365
+ value=vote_manager.create_request_vote_df(
366
+ pending_eval_queue_df
367
+ ),
368
+ headers=EVAL_COLS,
369
+ datatype=EVAL_TYPES,
370
+ row_count=5,
371
+ interactive=False
372
+ )
373
+
374
+ # Set the click event for the vote button
375
+ vote_button.click(
376
+ vote_manager.add_vote,
377
+ inputs=[selected_model, pending_eval_table],
378
+ outputs=[pending_eval_table_votes]
379
+ )
380
+
381
 
382
  with gr.Row():
383
  with gr.Accordion("📙 Citation", open=False):
 
389
  show_copy_button=True,
390
  )
391
 
392
+ main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
393
+ leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
394
+ pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
395
+
396
+ main_block.queue(default_concurrency_limit=40)
397
+
398
+
399
+ def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
400
+ # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
401
+ # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
402
+ # ht to Lucain!
403
+ if SPACE_ID is None:
404
+ print("Not in a Space: Space CI disabled.")
405
+ return WebhooksServer(ui=main_block)
406
+
407
+ if IS_EPHEMERAL_SPACE:
408
+ print("In an ephemeral Space: Space CI disabled.")
409
+ return WebhooksServer(ui=main_block)
410
+
411
+ card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
412
+ config = card.data.get("space_ci", {})
413
+ print(f"Enabling Space CI with config from README: {config}")
414
+
415
+ return configure_space_ci(
416
+ blocks=ui,
417
+ trusted_authors=config.get("trusted_authors"),
418
+ private=config.get("private", "auto"),
419
+ variables=config.get("variables", "auto"),
420
+ secrets=config.get("secrets"),
421
+ hardware=config.get("hardware"),
422
+ storage=config.get("storage"),
423
+ )
424
+
425
+ # Create webhooks server (with CI url if in Space and not ephemeral)
426
+ webhooks_server = enable_space_ci_and_return_server(ui=main_block)
427
+
428
+ # Add webhooks
429
+ @webhooks_server.add_webhook
430
+ def update_leaderboard(payload: WebhookPayload) -> None:
431
+ """Redownloads the leaderboard dataset each time it updates"""
432
+ if payload.repo.type == "dataset" and payload.event.action == "update":
433
+ datasets.load_dataset(
434
+ AGGREGATED_REPO,
435
+ "default",
436
+ split="train",
437
+ cache_dir=HF_HOME,
438
+ download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
439
+ verification_mode="no_checks"
440
+ )
441
+
442
+ # The below code is not used at the moment, as we can manage the queue file locally
443
+ LAST_UPDATE_QUEUE = datetime.datetime.now()
444
+ @webhooks_server.add_webhook
445
+ def update_queue(payload: WebhookPayload) -> None:
446
+ """Redownloads the queue dataset each time it updates"""
447
+ if payload.repo.type == "dataset" and payload.event.action == "update":
448
+ current_time = datetime.datetime.now()
449
+ global LAST_UPDATE_QUEUE
450
+ if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
451
+ print("Would have updated the queue")
452
+ # We only redownload if the last update was more than 10 minutes ago, as the queue is
453
+ # updated regularly and heavy to download
454
+ #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
455
+ LAST_UPDATE_QUEUE = datetime.datetime.now()
456
+
457
+ webhooks_server.launch()
app_bkp.py DELETED
@@ -1,316 +0,0 @@
1
- import os
2
- import logging
3
- import time
4
- import datetime
5
- import gradio as gr
6
- import datasets
7
- from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
8
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
9
-
10
- from src.display.about import (
11
- CITATION_BUTTON_LABEL,
12
- CITATION_BUTTON_TEXT,
13
- EVALUATION_QUEUE_TEXT,
14
- FAQ_TEXT,
15
- INTRODUCTION_TEXT,
16
- LLM_BENCHMARKS_TEXT,
17
- TITLE,
18
- )
19
- from src.display.css_html_js import custom_css
20
- from src.display.utils import (
21
- BENCHMARK_COLS,
22
- COLS,
23
- EVAL_COLS,
24
- EVAL_TYPES,
25
- AutoEvalColumn,
26
- ModelType,
27
- Precision,
28
- WeightType,
29
- fields,
30
- )
31
- from src.envs import (
32
- API,
33
- EVAL_REQUESTS_PATH,
34
- AGGREGATED_REPO,
35
- HF_TOKEN,
36
- QUEUE_REPO,
37
- REPO_ID,
38
- HF_HOME,
39
- )
40
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
41
- from src.submission.submit import add_new_eval
42
- from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
43
-
44
- # Configure logging
45
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
46
-
47
-
48
- # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
49
- # This controls whether a full initialization should be performed.
50
- DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
51
- LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
52
-
53
- def restart_space():
54
- API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
55
-
56
-
57
- def time_diff_wrapper(func):
58
- def wrapper(*args, **kwargs):
59
- start_time = time.time()
60
- result = func(*args, **kwargs)
61
- end_time = time.time()
62
- diff = end_time - start_time
63
- logging.info(f"Time taken for {func.__name__}: {diff} seconds")
64
- return result
65
-
66
- return wrapper
67
-
68
-
69
- @time_diff_wrapper
70
- def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
71
- """Download dataset with exponential backoff retries."""
72
- attempt = 0
73
- while attempt < max_attempts:
74
- try:
75
- logging.info(f"Downloading {repo_id} to {local_dir}")
76
- snapshot_download(
77
- repo_id=repo_id,
78
- local_dir=local_dir,
79
- repo_type=repo_type,
80
- tqdm_class=None,
81
- etag_timeout=30,
82
- max_workers=8,
83
- )
84
- logging.info("Download successful")
85
- return
86
- except Exception as e:
87
- wait_time = backoff_factor**attempt
88
- logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
89
- time.sleep(wait_time)
90
- attempt += 1
91
- raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
92
-
93
- def get_latest_data_leaderboard(leaderboard_initial_df = None):
94
- current_time = datetime.datetime.now()
95
- global LAST_UPDATE_LEADERBOARD
96
- if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
97
- return leaderboard_initial_df
98
- LAST_UPDATE_LEADERBOARD = current_time
99
- leaderboard_dataset = datasets.load_dataset(
100
- AGGREGATED_REPO,
101
- "default",
102
- split="train",
103
- cache_dir=HF_HOME,
104
- download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
105
- verification_mode="no_checks"
106
- )
107
-
108
- leaderboard_df = get_leaderboard_df(
109
- leaderboard_dataset=leaderboard_dataset,
110
- cols=COLS,
111
- benchmark_cols=BENCHMARK_COLS,
112
- )
113
-
114
- return leaderboard_df
115
-
116
- def get_latest_data_queue():
117
- eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
118
- return eval_queue_dfs
119
-
120
- def init_space():
121
- """Initializes the application space, loading only necessary data."""
122
- if DO_FULL_INIT:
123
- # These downloads only occur on full initialization
124
- try:
125
- download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
126
- except Exception:
127
- restart_space()
128
-
129
- # Always redownload the leaderboard DataFrame
130
- leaderboard_df = get_latest_data_leaderboard()
131
-
132
- # Evaluation queue DataFrame retrieval is independent of initialization detail level
133
- eval_queue_dfs = get_latest_data_queue()
134
-
135
- return leaderboard_df, eval_queue_dfs
136
-
137
-
138
- # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
139
- # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
140
- leaderboard_df, eval_queue_dfs = init_space()
141
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
142
-
143
-
144
- # Data processing for plots now only on demand in the respective Gradio tab
145
- def load_and_create_plots():
146
- plot_df = create_plot_df(create_scores_df(leaderboard_df))
147
- return plot_df
148
-
149
- def init_leaderboard(dataframe):
150
- return Leaderboard(
151
- value = dataframe,
152
- datatype=[c.type for c in fields(AutoEvalColumn)],
153
- select_columns=SelectColumns(
154
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
155
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
156
- label="Select Columns to Display:",
157
- ),
158
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
159
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
160
- filter_columns=[
161
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
162
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
163
- ColumnFilter(
164
- AutoEvalColumn.params.name,
165
- type="slider",
166
- min=0.01,
167
- max=150,
168
- label="Select the number of parameters (B)",
169
- ),
170
- ColumnFilter(
171
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
172
- ),
173
- ColumnFilter(
174
- AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
175
- ),
176
- ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
177
- ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
178
- ],
179
- bool_checkboxgroup_label="Hide models",
180
- interactive=False,
181
- )
182
-
183
- demo = gr.Blocks(css=custom_css)
184
- with demo:
185
- gr.HTML(TITLE)
186
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
187
-
188
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
189
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
190
- leaderboard = init_leaderboard(leaderboard_df)
191
-
192
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
193
- with gr.Row():
194
- with gr.Column():
195
- plot_df = load_and_create_plots()
196
- chart = create_metric_plot_obj(
197
- plot_df,
198
- [AutoEvalColumn.average.name],
199
- title="Average of Top Scores and Human Baseline Over Time (from last update)",
200
- )
201
- gr.Plot(value=chart, min_width=500)
202
- with gr.Column():
203
- plot_df = load_and_create_plots()
204
- chart = create_metric_plot_obj(
205
- plot_df,
206
- BENCHMARK_COLS,
207
- title="Top Scores and Human Baseline Over Time (from last update)",
208
- )
209
- gr.Plot(value=chart, min_width=500)
210
-
211
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
212
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
213
-
214
- with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
215
- gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
216
-
217
- with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
218
- countdown = gr.HTML(
219
- """<div align="center">
220
- <div position: relative>
221
- <img
222
- src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
223
- allowtransparency="true"
224
- style="display:block;width:100%;height:auto;"
225
- />
226
- <iframe
227
- src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&amp;timezone=Europe%2FParis&amp;width=&amp;style=circles&amp;uid=815898&amp;loc=https://logwork.com/countdown-fxmc&amp;language=en&amp;textcolor=&amp;background=%23ffd21e&amp;date=2024-06-26%2015%3A00%3A00&amp;digitscolor=%23ff9d00&amp;unitscolor=&amp"
228
- style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
229
- scrolling="no"
230
- allowtransparency="true"
231
- frameborder="0"
232
- allowfullscreen
233
- />
234
- </div>
235
- </div>"""
236
- )
237
- #gif = gr.Image(value="./gif.gif", interactive=False)
238
- gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
239
-
240
- with gr.Row():
241
- with gr.Accordion("📙 Citation", open=False):
242
- citation_button = gr.Textbox(
243
- value=CITATION_BUTTON_TEXT,
244
- label=CITATION_BUTTON_LABEL,
245
- lines=20,
246
- elem_id="citation-button",
247
- show_copy_button=True,
248
- )
249
-
250
- demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
251
-
252
-
253
- demo.queue(default_concurrency_limit=40)
254
-
255
- # Start ephemeral Spaces on PRs (see config in README.md)
256
- from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
257
-
258
- def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
259
- # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
260
- # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
261
- # ht to Lucain!
262
- if SPACE_ID is None:
263
- print("Not in a Space: Space CI disabled.")
264
- return WebhooksServer(ui=demo)
265
-
266
- if IS_EPHEMERAL_SPACE:
267
- print("In an ephemeral Space: Space CI disabled.")
268
- return WebhooksServer(ui=demo)
269
-
270
- card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
271
- config = card.data.get("space_ci", {})
272
- print(f"Enabling Space CI with config from README: {config}")
273
-
274
- return configure_space_ci(
275
- blocks=ui,
276
- trusted_authors=config.get("trusted_authors"),
277
- private=config.get("private", "auto"),
278
- variables=config.get("variables", "auto"),
279
- secrets=config.get("secrets"),
280
- hardware=config.get("hardware"),
281
- storage=config.get("storage"),
282
- )
283
-
284
- # Create webhooks server (with CI url if in Space and not ephemeral)
285
- webhooks_server = enable_space_ci_and_return_server(ui=demo)
286
-
287
- # Add webhooks
288
- @webhooks_server.add_webhook
289
- def update_leaderboard(payload: WebhookPayload) -> None:
290
- """Redownloads the leaderboard dataset each time it updates"""
291
- if payload.repo.type == "dataset" and payload.event.action == "update":
292
- datasets.load_dataset(
293
- AGGREGATED_REPO,
294
- "default",
295
- split="train",
296
- cache_dir=HF_HOME,
297
- download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
298
- verification_mode="no_checks"
299
- )
300
-
301
- # The below code is not used at the moment, as we can manage the queue file locally
302
- LAST_UPDATE_QUEUE = datetime.datetime.now()
303
- @webhooks_server.add_webhook
304
- def update_queue(payload: WebhookPayload) -> None:
305
- """Redownloads the queue dataset each time it updates"""
306
- if payload.repo.type == "dataset" and payload.event.action == "update":
307
- current_time = datetime.datetime.now()
308
- global LAST_UPDATE_QUEUE
309
- if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
310
- print("Would have updated the queue")
311
- # We only redownload is last update was more than 10 minutes ago, as the queue is
312
- # updated regularly and heavy to download
313
- #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
314
- LAST_UPDATE_QUEUE = datetime.datetime.now()
315
-
316
- webhooks_server.launch()
gif.gif DELETED

Git LFS Details

  • SHA256: ca34fd48c50eda15857dffedd1659921e7ae33e1d53f5e7afa34696040f4ef80
  • Pointer size: 132 Bytes
  • Size of remote file: 3.85 MB
pyproject.toml CHANGED
@@ -38,16 +38,18 @@ numpy = "1.26.0"
38
  pandas = "2.2.2"
39
  plotly = "5.14.1"
40
  python-dateutil = "2.8.2"
41
- requests = "2.28.2"
42
  sentencepiece = "^0.2.0"
43
  tqdm = "4.65.0"
44
  transformers = "4.41.1"
45
  tokenizers = ">=0.15.0"
46
  gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
47
- gradio = " 4.20.0"
48
  isort = "^5.13.2"
49
  ruff = "^0.3.5"
50
  gradio-leaderboard = "0.0.8"
 
 
 
 
51
 
52
  [build-system]
53
  requires = ["poetry-core"]
 
38
  pandas = "2.2.2"
39
  plotly = "5.14.1"
40
  python-dateutil = "2.8.2"
 
41
  sentencepiece = "^0.2.0"
42
  tqdm = "4.65.0"
43
  transformers = "4.41.1"
44
  tokenizers = ">=0.15.0"
45
  gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
 
46
  isort = "^5.13.2"
47
  ruff = "^0.3.5"
48
  gradio-leaderboard = "0.0.8"
49
+ gradio = {extras = ["oauth"], version = "^4.36.1"}
50
+ requests = "^2.31.0"
51
+ requests-oauthlib = "^1.3.1"
52
+ schedule = "^1.2.2"
53
 
54
  [build-system]
55
  requires = ["poetry-core"]
requirements.txt CHANGED
@@ -8,11 +8,16 @@ numpy==1.26.0
8
  pandas==2.2.2
9
  plotly==5.14.1
10
  python-dateutil==2.8.2
11
- requests==2.28.2
12
  sentencepiece
13
  tqdm==4.65.0
14
  transformers==4.41.1
15
  tokenizers>=0.15.0
16
  gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
17
- gradio==4.20.0
 
 
 
18
  gradio_leaderboard==0.0.9
 
 
 
 
8
  pandas==2.2.2
9
  plotly==5.14.1
10
  python-dateutil==2.8.2
 
11
  sentencepiece
12
  tqdm==4.65.0
13
  transformers==4.41.1
14
  tokenizers>=0.15.0
15
  gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
16
+ isort
17
+ ruff
18
+ gradio==4.31.0
19
+ gradio[oauth]
20
  gradio_leaderboard==0.0.9
21
+ requests==2.31.0
22
+ requests-oauthlib== 1.3.1
23
+ schedule == 1.2.2
src/display/about.py CHANGED
@@ -219,6 +219,89 @@ CITATION_BUTTON_TEXT = r"""
219
  publisher = {Hugging Face},
220
  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
221
  }
222
-
223
- ????
224
  """
 
219
  publisher = {Hugging Face},
220
  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
221
  }
222
+ @software{eval-harness,
223
+ author = {Gao, Leo and
224
+ Tow, Jonathan and
225
+ Biderman, Stella and
226
+ Black, Sid and
227
+ DiPofi, Anthony and
228
+ Foster, Charles and
229
+ Golding, Laurence and
230
+ Hsu, Jeffrey and
231
+ McDonell, Kyle and
232
+ Muennighoff, Niklas and
233
+ Phang, Jason and
234
+ Reynolds, Laria and
235
+ Tang, Eric and
236
+ Thite, Anish and
237
+ Wang, Ben and
238
+ Wang, Kevin and
239
+ Zou, Andy},
240
+ title = {A framework for few-shot language model evaluation},
241
+ month = sep,
242
+ year = 2021,
243
+ publisher = {Zenodo},
244
+ version = {v0.0.1},
245
+ doi = {10.5281/zenodo.5371628},
246
+ url = {https://doi.org/10.5281/zenodo.5371628}
247
+ }
248
+ @misc{clark2018think,
249
+ title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
250
+ author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
251
+ year={2018},
252
+ eprint={1803.05457},
253
+ archivePrefix={arXiv},
254
+ primaryClass={cs.AI}
255
+ }
256
+ @misc{zellers2019hellaswag,
257
+ title={HellaSwag: Can a Machine Really Finish Your Sentence?},
258
+ author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
259
+ year={2019},
260
+ eprint={1905.07830},
261
+ archivePrefix={arXiv},
262
+ primaryClass={cs.CL}
263
+ }
264
+ @misc{hendrycks2021measuring,
265
+ title={Measuring Massive Multitask Language Understanding},
266
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
267
+ year={2021},
268
+ eprint={2009.03300},
269
+ archivePrefix={arXiv},
270
+ primaryClass={cs.CY}
271
+ }
272
+ @misc{lin2022truthfulqa,
273
+ title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
274
+ author={Stephanie Lin and Jacob Hilton and Owain Evans},
275
+ year={2022},
276
+ eprint={2109.07958},
277
+ archivePrefix={arXiv},
278
+ primaryClass={cs.CL}
279
+ }
280
+ @misc{DBLP:journals/corr/abs-1907-10641,
281
+ title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
282
+ author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
283
+ year={2019},
284
+ eprint={1907.10641},
285
+ archivePrefix={arXiv},
286
+ primaryClass={cs.CL}
287
+ }
288
+ @misc{DBLP:journals/corr/abs-2110-14168,
289
+ title={Training Verifiers to Solve Math Word Problems},
290
+ author={Karl Cobbe and
291
+ Vineet Kosaraju and
292
+ Mohammad Bavarian and
293
+ Mark Chen and
294
+ Heewoo Jun and
295
+ Lukasz Kaiser and
296
+ Matthias Plappert and
297
+ Jerry Tworek and
298
+ Jacob Hilton and
299
+ Reiichiro Nakano and
300
+ Christopher Hesse and
301
+ John Schulman},
302
+ year={2021},
303
+ eprint={2110.14168},
304
+ archivePrefix={arXiv},
305
+ primaryClass={cs.CL}
306
+ }
307
  """
src/display/css_html_js.py CHANGED
@@ -9,7 +9,7 @@ table th:first-child {
9
 
10
  /* Full width space */
11
  .gradio-container {
12
- max-width: 95%!important;
13
  }
14
 
15
  /* Text style and margins */
@@ -48,7 +48,7 @@ table th:first-child {
48
  }
49
 
50
  /* Filters style */
51
- #filter_type{
52
  border: 0;
53
  padding-left: 0;
54
  padding-top: 0;
@@ -56,29 +56,53 @@ table th:first-child {
56
  #filter_type label {
57
  display: flex;
58
  }
59
- #filter_type label > span{
60
  margin-top: var(--spacing-lg);
61
  margin-right: 0.5em;
62
  }
63
- #filter_type label > .wrap{
64
  width: 103px;
65
  }
66
- #filter_type label > .wrap .wrap-inner{
67
  padding: 2px;
68
  }
69
- #filter_type label > .wrap .wrap-inner input{
70
- width: 1px
71
  }
72
- #filter-columns-type{
73
- border:0;
74
- padding:0.5;
 
 
 
 
75
  }
76
- #filter-columns-size{
77
- border:0;
78
- padding:0.5;
79
  }
80
- #box-filter > .form{
81
- border: 0
82
  }
83
  """
84
 
 
9
 
10
  /* Full width space */
11
  .gradio-container {
12
+ max-width: 95% !important;
13
  }
14
 
15
  /* Text style and margins */
 
48
  }
49
 
50
  /* Filters style */
51
+ #filter_type {
52
  border: 0;
53
  padding-left: 0;
54
  padding-top: 0;
 
56
  #filter_type label {
57
  display: flex;
58
  }
59
+ #filter_type label > span {
60
  margin-top: var(--spacing-lg);
61
  margin-right: 0.5em;
62
  }
63
+ #filter_type label > .wrap {
64
  width: 103px;
65
  }
66
+ #filter_type label > .wrap .wrap-inner {
67
  padding: 2px;
68
  }
69
+ #filter_type label > .wrap .wrap-inner input {
70
+ width: 1px;
71
  }
72
+ #filter-columns-type {
73
+ border: 0;
74
+ padding: 0.5;
75
+ }
76
+ #filter-columns-size {
77
+ border: 0;
78
+ padding: 0.5;
79
  }
80
+ #box-filter > .form {
81
+ border: 0;
 
82
  }
83
+
84
+ /* Header styles */
85
+ #header-title {
86
+ text-align: left;
87
+ display: inline-block;
88
+ }
89
+
90
+ #header-row {
91
+ display: flex;
92
+ justify-content: space-between;
93
+ align-items: center;
94
+ }
95
+
96
+ #header-row .gradio-html {
97
+ flex-grow: 1;
98
+ }
99
+
100
+ #oauth-button {
101
+ height: auto;
102
+ min-width: max-content;
103
+ white-space: nowrap;
104
+ padding: 10px 20px;
105
+ border-radius: 4px;
106
  }
107
  """
108
 
src/display/formatting.py CHANGED
@@ -11,7 +11,7 @@ def make_clickable_model(model_name):
11
  link = f"https://huggingface.co/{model_name}"
12
 
13
  details_model_name = model_name.replace("/", "__")
14
- details_link = f"https://huggingface.co/datasets/open-llm-leaderboard-old/details_{details_model_name}"
15
 
16
  return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
17
 
 
11
  link = f"https://huggingface.co/{model_name}"
12
 
13
  details_model_name = model_name.replace("/", "__")
14
+ details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
15
 
16
  return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
17
 
src/display/utils.py CHANGED
@@ -49,12 +49,23 @@ class Task:
49
 
50
 
51
  class Tasks(Enum):
52
- arc = Task("arc:challenge", "acc_norm", "ARC")
53
- hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
54
- mmlu = Task("hendrycksTest", "acc", "MMLU")
55
- truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
56
- winogrande = Task("winogrande", "acc", "Winogrande")
57
- gsm8k = Task("gsm8k", "acc", "GSM8K")
58
 
59
 
60
  # These classes are for user facing column names,
@@ -77,7 +88,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
77
  # Scores
78
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
79
  for task in Tasks:
80
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
81
  # Model information
82
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
83
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -94,7 +106,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
94
  auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
95
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
96
  auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
97
- # Dummy column for the search bar (hidden by the custom CSS)
 
 
 
98
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
99
 
100
  # We use make dataclass to dynamically fill the scores from Tasks
@@ -103,30 +118,31 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
103
 
104
  @dataclass(frozen=True)
105
  class EvalQueueColumn: # Queue column
106
- model = ColumnContent("model", "markdown", True)
 
107
  revision = ColumnContent("revision", "str", True)
108
- private = ColumnContent("private", "bool", True)
109
  precision = ColumnContent("precision", "str", True)
110
- weight_type = ColumnContent("weight_type", "str", "Original")
111
  status = ColumnContent("status", "str", True)
112
 
113
 
114
- baseline_row = {
115
- AutoEvalColumn.model.name: "<p>Baseline</p>",
116
- AutoEvalColumn.revision.name: "N/A",
117
- AutoEvalColumn.precision.name: None,
118
- AutoEvalColumn.merged.name: False,
119
- AutoEvalColumn.average.name: 31.0,
120
- AutoEvalColumn.arc.name: 25.0,
121
- AutoEvalColumn.hellaswag.name: 25.0,
122
- AutoEvalColumn.mmlu.name: 25.0,
123
- AutoEvalColumn.truthfulqa.name: 25.0,
124
- AutoEvalColumn.winogrande.name: 50.0,
125
- AutoEvalColumn.gsm8k.name: 0.21,
126
- AutoEvalColumn.fullname.name: "baseline",
127
- AutoEvalColumn.model_type.name: "",
128
- AutoEvalColumn.not_flagged.name: False,
129
- }
130
 
131
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
132
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -136,22 +152,22 @@ baseline_row = {
136
  # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
137
  # GSM8K: paper
138
  # Define the human baselines
139
- human_baseline_row = {
140
- AutoEvalColumn.model.name: "<p>Human performance</p>",
141
- AutoEvalColumn.revision.name: "N/A",
142
- AutoEvalColumn.precision.name: None,
143
- AutoEvalColumn.average.name: 92.75,
144
- AutoEvalColumn.merged.name: False,
145
- AutoEvalColumn.arc.name: 80.0,
146
- AutoEvalColumn.hellaswag.name: 95.0,
147
- AutoEvalColumn.mmlu.name: 89.8,
148
- AutoEvalColumn.truthfulqa.name: 94.0,
149
- AutoEvalColumn.winogrande.name: 94.0,
150
- AutoEvalColumn.gsm8k.name: 100,
151
- AutoEvalColumn.fullname.name: "human_baseline",
152
- AutoEvalColumn.model_type.name: "",
153
- AutoEvalColumn.not_flagged.name: False,
154
- }
155
 
156
 
157
  @dataclass
@@ -166,22 +182,22 @@ class ModelType(Enum):
166
  FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
167
  chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
168
  merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
169
- Unknown = ModelDetails(name="", symbol="?")
170
 
171
  def to_str(self, separator=" "):
172
  return f"{self.value.symbol}{separator}{self.value.name}"
173
 
174
  @staticmethod
175
- def from_str(type):
176
- if "fine-tuned" in type or "🔶" in type:
177
  return ModelType.FT
178
- if "continously pretrained" in type or "🟩" in type:
179
  return ModelType.CPT
180
- if "pretrained" in type or "🟢" in type:
181
  return ModelType.PT
182
- if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
183
  return ModelType.chat
184
- if "merge" in type or "🤝" in type:
185
  return ModelType.merges
186
  return ModelType.Unknown
187
 
@@ -200,6 +216,7 @@ class Precision(Enum):
200
  qt_GPTQ = ModelDetails("GPTQ")
201
  Unknown = ModelDetails("?")
202
 
 
203
  def from_str(precision):
204
  if precision in ["torch.float16", "float16"]:
205
  return Precision.float16
 
49
 
50
 
51
  class Tasks(Enum):
52
+ ifeval = Task("leaderboard_ifeval", "strict_acc,none", "IFEval")
53
+ ifeval_raw = Task("leaderboard_ifeval", "strict_acc,none", "IFEval Raw")
54
+
55
+ bbh = Task("leaderboard_bbh", "acc_norm,none", "BBH")
56
+ bbh_raw = Task("leaderboard_bbh", "acc_norm,none", "BBH Raw")
57
+
58
+ math = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5")
59
+ math_raw = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5 Raw")
60
+
61
+ gpqa = Task("leaderboard_gpqa", "acc_norm,none", "GPQA")
62
+ gpqa_raw = Task("leaderboard_gpqa", "acc_norm,none", "GPQA Raw")
63
+
64
+ musr = Task("leaderboard_musr", "acc_norm,none", "MUSR")
65
+ musr_raw = Task("leaderboard_musr", "acc_norm,none", "MUSR Raw")
66
+
67
+ mmlu_pro = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO")
68
+ mmlu_pro_raw = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO Raw")
69
 
70
 
71
  # These classes are for user facing column names,
 
88
  # Scores
89
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
90
  for task in Tasks:
91
+ displayed_by_default = not task.name.endswith("_raw")
92
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default=displayed_by_default)])
93
  # Model information
94
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
95
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
106
  auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
107
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
108
  auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
109
+ auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Chat Template", "bool", False)])
110
+ auto_eval_column_dict.append(["maintainers_highlight", ColumnContent, ColumnContent("Maintainer's Highlight", "bool", False, hidden=True)])
111
+
112
+ # fullname structure: <user>/<model_name>
113
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
114
 
115
  # We use make dataclass to dynamically fill the scores from Tasks
 
118
 
119
  @dataclass(frozen=True)
120
  class EvalQueueColumn: # Queue column
121
+ model_link = ColumnContent("model_link", "markdown", True)
122
+ model_name = ColumnContent("model_name", "str", True)
123
  revision = ColumnContent("revision", "str", True)
124
+ #private = ColumnContent("private", "bool", True) # Should not be displayed
125
  precision = ColumnContent("precision", "str", True)
126
+ #weight_type = ColumnContent("weight_type", "str", "Original") # Might be confusing, to think about
127
  status = ColumnContent("status", "str", True)
128
 
129
 
130
+ # baseline_row = {
131
+ # AutoEvalColumn.model.name: "<p>Baseline</p>",
132
+ # AutoEvalColumn.revision.name: "N/A",
133
+ # AutoEvalColumn.precision.name: None,
134
+ # AutoEvalColumn.merged.name: False,
135
+ # AutoEvalColumn.average.name: 31.0,
136
+ # AutoEvalColumn.arc.name: 25.0,
137
+ # AutoEvalColumn.hellaswag.name: 25.0,
138
+ # AutoEvalColumn.mmlu.name: 25.0,
139
+ # AutoEvalColumn.truthfulqa.name: 25.0,
140
+ # AutoEvalColumn.winogrande.name: 50.0,
141
+ # AutoEvalColumn.gsm8k.name: 0.21,
142
+ # AutoEvalColumn.fullname.name: "baseline",
143
+ # AutoEvalColumn.model_type.name: "",
144
+ # AutoEvalColumn.not_flagged.name: False,
145
+ # }
146
 
147
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
148
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 
152
  # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
153
  # GSM8K: paper
154
  # Define the human baselines
155
+ # human_baseline_row = {
156
+ # AutoEvalColumn.model.name: "<p>Human performance</p>",
157
+ # AutoEvalColumn.revision.name: "N/A",
158
+ # AutoEvalColumn.precision.name: None,
159
+ # AutoEvalColumn.average.name: 92.75,
160
+ # AutoEvalColumn.merged.name: False,
161
+ # AutoEvalColumn.arc.name: 80.0,
162
+ # AutoEvalColumn.hellaswag.name: 95.0,
163
+ # AutoEvalColumn.mmlu.name: 89.8,
164
+ # AutoEvalColumn.truthfulqa.name: 94.0,
165
+ # AutoEvalColumn.winogrande.name: 94.0,
166
+ # AutoEvalColumn.gsm8k.name: 100,
167
+ # AutoEvalColumn.fullname.name: "human_baseline",
168
+ # AutoEvalColumn.model_type.name: "",
169
+ # AutoEvalColumn.not_flagged.name: False,
170
+ # }
171
 
172
 
173
  @dataclass
 
182
  FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
183
  chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
184
  merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
185
+ Unknown = ModelDetails(name="❓ other", symbol="")
186
 
187
  def to_str(self, separator=" "):
188
  return f"{self.value.symbol}{separator}{self.value.name}"
189
 
190
  @staticmethod
191
+ def from_str(m_type):
192
+ if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
193
  return ModelType.FT
194
+ if "continuously pretrained" in m_type or "🟩" in m_type:
195
  return ModelType.CPT
196
+ if "pretrained" in m_type or "🟢" in m_type:
197
  return ModelType.PT
198
+ if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
199
  return ModelType.chat
200
+ if "merge" in m_type or "🤝" in m_type:
201
  return ModelType.merges
202
  return ModelType.Unknown
203
 
 
216
  qt_GPTQ = ModelDetails("GPTQ")
217
  Unknown = ModelDetails("?")
218
 
219
+ @staticmethod
220
  def from_str(precision):
221
  if precision in ["torch.float16", "float16"]:
222
  return Precision.float16
src/envs.py CHANGED
@@ -4,9 +4,10 @@ from huggingface_hub import HfApi
4
  # clone / pull the lmeh eval data
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
- REPO_ID = "open-llm-leaderboard-old/open_llm_leaderboard"
8
- QUEUE_REPO = "open-llm-leaderboard-old/requests"
9
- AGGREGATED_REPO = "open-llm-leaderboard-old/contents"
 
10
 
11
  HF_HOME = os.getenv("HF_HOME", ".")
12
 
@@ -20,11 +21,12 @@ if not os.access(HF_HOME, os.W_OK):
20
  else:
21
  print("Write access confirmed for HF_HOME")
22
 
 
23
  EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
24
 
25
  # Rate limit variables
26
  RATE_LIMIT_PERIOD = 7
27
  RATE_LIMIT_QUOTA = 5
28
- HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
29
 
30
  API = HfApi(token=HF_TOKEN)
 
4
  # clone / pull the lmeh eval data
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
+ REPO_ID = "open-llm-leaderboard/open_llm_leaderboard_v2"
8
+ QUEUE_REPO = "open-llm-leaderboard/requests"
9
+ AGGREGATED_REPO = "open-llm-leaderboard/contents"
10
+ VOTES_REPO = "open-llm-leaderboard/votes"
11
 
12
  HF_HOME = os.getenv("HF_HOME", ".")
13
 
 
21
  else:
22
  print("Write access confirmed for HF_HOME")
23
 
24
+ VOTES_PATH = os.path.join(HF_HOME, "model-votes")
25
  EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
26
 
27
  # Rate limit variables
28
  RATE_LIMIT_PERIOD = 7
29
  RATE_LIMIT_QUOTA = 5
30
+ HAS_HIGHER_RATE_LIMIT = []
31
 
32
  API = HfApi(token=HF_TOKEN)
src/leaderboard/filter_models.py CHANGED
@@ -4,122 +4,8 @@ from src.display.utils import AutoEvalColumn
4
 
5
  # Models which have been flagged by users as being problematic for a reason or another
  # (Model name to forum discussion link)
- FLAGGED_MODELS = {
-     "merged": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
-     "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
-     "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
-     "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
-     "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
-     "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
-     "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-     "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-     "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-     "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
-     "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
-     "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-     "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
-     "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-     "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-     # Merges not indicated
-     "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-     "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-     "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-     "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-     "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-     "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-     "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     # MoErges
-     "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-     # Other - contamination mostly
-     "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
-     "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
-     "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-     "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
- }

  # Models which have been requested by orgs to not be submitted on the leaderboard
  DO_NOT_SUBMIT_MODELS = [
@@ -133,12 +19,16 @@ DO_NOT_SUBMIT_MODELS = [
  def flag_models(leaderboard_data: list[dict]):
      """Flags models based on external criteria or flagged status."""
      for model_data in leaderboard_data:
          # If a model is not flagged, use its "fullname" as a key
          if model_data[AutoEvalColumn.not_flagged.name]:
              flag_key = model_data[AutoEvalColumn.fullname.name]
          else:
-             # Merges and moes are flagged
-             flag_key = "merged"

          # Reverse the logic: Check for non-flagged models instead
          if flag_key in FLAGGED_MODELS:

  # Models which have been flagged by users as being problematic for a reason or another
  # (Model name to forum discussion link)
+ # None for the v2 so far!
+ FLAGGED_MODELS = {}

  # Models which have been requested by orgs to not be submitted on the leaderboard
  DO_NOT_SUBMIT_MODELS = [

  def flag_models(leaderboard_data: list[dict]):
      """Flags models based on external criteria or flagged status."""
      for model_data in leaderboard_data:
+         # Skip flagging if maintainers_highlight is True
+         if model_data.get(AutoEvalColumn.maintainers_highlight.name, False):
+             model_data[AutoEvalColumn.not_flagged.name] = True
+             continue
+
          # If a model is not flagged, use its "fullname" as a key
          if model_data[AutoEvalColumn.not_flagged.name]:
              flag_key = model_data[AutoEvalColumn.fullname.name]
          else:
+             flag_key = None

          # Reverse the logic: Check for non-flagged models instead
          if flag_key in FLAGGED_MODELS:
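
To make the new flow concrete, here is a minimal sketch of how a leaderboard row now moves through `flag_models` (illustrative only: plain string keys stand in for the real `AutoEvalColumn.*.name` values, and the flag-handling branch is reduced to a comment):

```python
# Sketch with stand-in column names; the real names come from AutoEvalColumn.
FLAGGED_MODELS = {}  # empty for the v2 leaderboard, as in this commit

def flag_models_sketch(leaderboard_data: list[dict]) -> None:
    for model_data in leaderboard_data:
        # Maintainer-highlighted rows are exempt from flagging
        if model_data.get("maintainers_highlight", False):
            model_data["not_flagged"] = True
            continue
        # Un-flagged rows are looked up by their full model name
        flag_key = model_data["fullname"] if model_data["not_flagged"] else None
        if flag_key in FLAGGED_MODELS:
            # here the real code would mark the row and attach the forum discussion link
            model_data["not_flagged"] = False

rows = [{"fullname": "org/some-model", "not_flagged": True, "maintainers_highlight": False}]
flag_models_sketch(rows)
```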
src/populate.py CHANGED
@@ -2,14 +2,15 @@ import pathlib
  import pandas as pd
  from datasets import Dataset
  from src.display.formatting import has_no_nan_values, make_clickable_model
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
  from src.leaderboard.filter_models import filter_models_flags
  from src.display.utils import load_json_data


  def _process_model_data(entry, model_name_key="model", revision_key="revision"):
      """Enrich model data with clickable links and revisions."""
-     entry[EvalQueueColumn.model.name] = make_clickable_model(entry.get(model_name_key, ""))
      entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
      return entry

@@ -50,4 +51,4 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
      df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
      df = df[cols].round(decimals=2)
      df = df[has_no_nan_values(df, benchmark_cols)]
-     return df

  import pandas as pd
  from datasets import Dataset
  from src.display.formatting import has_no_nan_values, make_clickable_model
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
  from src.leaderboard.filter_models import filter_models_flags
  from src.display.utils import load_json_data


  def _process_model_data(entry, model_name_key="model", revision_key="revision"):
      """Enrich model data with clickable links and revisions."""
+     entry[EvalQueueColumn.model_name.name] = entry.get(model_name_key, "")
+     entry[EvalQueueColumn.model_link.name] = make_clickable_model(entry.get(model_name_key, ""))
      entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
      return entry

      df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
      df = df[cols].round(decimals=2)
      df = df[has_no_nan_values(df, benchmark_cols)]
+     return df
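
The practical effect is that queue entries now expose the plain model id and the rendered link as two separate fields; the plain id is what the voting table joins on. A rough sketch (the stand-in `make_clickable_model` below only approximates the helper in `src/display/formatting.py`):

```python
# Stand-in for src.display.formatting.make_clickable_model, for illustration only.
def make_clickable_model(model_name: str) -> str:
    return f'<a target="_blank" href="https://huggingface.co/{model_name}">{model_name}</a>'

entry = {"model": "org/some-model", "revision": "abc123"}
entry["model_name"] = entry.get("model", "")                        # plain id, used e.g. for vote lookups
entry["model_link"] = make_clickable_model(entry.get("model", ""))  # HTML rendered in the queue tables
entry["revision"] = entry.get("revision", "main")
print(entry)
```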
src/submission/submit.py CHANGED
@@ -32,6 +32,7 @@ def add_new_eval(
      precision: str,
      weight_type: str,
      model_type: str,
  ):
      global REQUESTED_MODELS
      global USERS_TO_SUBMISSION_DATES
@@ -129,6 +130,7 @@ def add_new_eval(
          "model_type": model_type,
          "job_id": -1,
          "job_start_time": None,
      }

      supplementary_info = {

      precision: str,
      weight_type: str,
      model_type: str,
+     use_chat_template: bool,
  ):
      global REQUESTED_MODELS
      global USERS_TO_SUBMISSION_DATES

          "model_type": model_type,
          "job_id": -1,
          "job_start_time": None,
+         "use_chat_template": use_chat_template,
      }

      supplementary_info = {
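
A hedged sketch of how the new flag could be collected from the submission form; the widget name below is an assumption (the actual form lives in app.py and is not part of this hunk), but the written request does gain a `use_chat_template` field:

```python
import gradio as gr

# Hypothetical form control; the real label and default may differ.
chat_template_checkbox = gr.Checkbox(label="Use chat template for evaluation", value=False)

# add_new_eval would then receive the boolean alongside the existing fields, so the
# request payload written to the queue repo would contain something like:
#   {"model_type": "...", "job_id": -1, "job_start_time": None, "use_chat_template": True}
```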
src/tools/plots.py CHANGED
@@ -4,7 +4,7 @@ import plotly.express as px
  from plotly.graph_objs import Figure

  from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
- from src.display.utils import human_baseline_row as HUMAN_BASELINE
  from src.leaderboard.filter_models import FLAGGED_MODELS


  from plotly.graph_objs import Figure

  from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
+ # from src.display.utils import human_baseline_row as HUMAN_BASELINE
  from src.leaderboard.filter_models import FLAGGED_MODELS


src/voting/vote_system.py ADDED
@@ -0,0 +1,151 @@
+ import json
+ import logging
+ import pathlib
+ import pandas as pd
+ import gradio as gr
+ import schedule
+ import time
+ from datetime import datetime, timezone
+ from src.display.utils import EvalQueueColumn
+
+ from src.envs import API
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class VoteManager:
+     def __init__(self, votes_path, eval_requests_path, repo_id):
+         self.votes_path = votes_path
+         self.eval_requests_path = eval_requests_path
+         self.repo_id = repo_id
+         self.vote_dataset = self.read_vote_dataset()
+         self.vote_check_set = self.make_check_set(self.vote_dataset)
+         self.votes_to_upload = []
+
+     def init_vote_dataset(self):
+         self.vote_dataset = self.read_vote_dataset()
+         self.vote_check_set = self.make_check_set(self.vote_dataset)
+
+     def read_vote_dataset(self):
+         result = []
+         votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+         if votes_file.exists():
+             with open(votes_file, "r") as f:
+                 for line in f:
+                     data = json.loads(line.strip())
+                     result.append(data)
+         result = pd.DataFrame(result)
+         return result
+
+     def make_check_set(self, vote_dataset: pd.DataFrame):
+         result = list()
+         for row in vote_dataset.itertuples(index=False, name='vote'):
+             result.append((row.model, row.revision, row.username))
+         return set(result)
+
+     def get_model_revision(self, selected_model: str) -> str:
+         """Fetch the revision for the given model from the request files."""
+         for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
+             if user_folder.is_dir():
+                 for file in user_folder.glob("*.json"):
+                     with open(file, "r") as f:
+                         data = json.load(f)
+                         if data.get("model") == selected_model:
+                             return data.get("revision", "main")
+         return "main"
+
+     def create_request_vote_df(self, pending_models_df: gr.Dataframe):
+         if pending_models_df.empty or not "model_name" in pending_models_df.columns:
+             return pending_models_df
+         self.vote_dataset = self.read_vote_dataset()
+         vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
+
+         pending_models_df_votes = pd.merge(
+             pending_models_df,
+             vote_counts,
+             left_on=["model_name", 'revision'],
+             right_on=['model', 'revision'],
+             how='left'
+         )
+         # Filling empty votes
+         pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
+         pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
+         # Removing useless columns
+         pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
+         return pending_models_df_votes
+
+     # Function to be called when a user votes for a model
+     def add_vote(
+         self,
+         selected_model: str,
+         pending_models_df: gr.Dataframe,
+         profile: gr.OAuthProfile | None
+     ):
+         logger.debug(f"Type of list before usage: {type(list)}")
+         # model_name, revision, user_id, timestamp
+         if selected_model in ["str", ""]:
+             gr.Warning("No model selected")
+             return
+
+         if profile is None:
+             gr.Warning("Hub Login required")
+             return
+
+         vote_username = profile.username
+         model_revision = self.get_model_revision(selected_model)
+
+         # tuple (immutable) used to check whether this user already voted for this model
+         check_tuple = (selected_model, model_revision, vote_username)
+         if check_tuple in self.vote_check_set:
+             gr.Warning("Already voted for this model")
+             return
+
+         current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+         vote_obj = {
+             "model": selected_model,
+             "revision": model_revision,
+             "username": vote_username,
+             "timestamp": current_time
+         }
+
+         # Append the vote to the JSONL file
+         try:
+             votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+             with open(votes_file, "a") as f:
+                 f.write(json.dumps(vote_obj) + "\n")
+             logger.info(f"Vote added locally: {vote_obj}")
+
+             self.votes_to_upload.append(vote_obj)
+         except Exception as e:
+             logger.error(f"Failed to write vote to file: {e}")
+             gr.Warning("Failed to record vote. Please try again")
+             return
+
+         self.vote_check_set.add(check_tuple)
+         gr.Info(f"Voted for {selected_model}")
+
+         return self.create_request_vote_df(pending_models_df)
+
+     def upload_votes(self):
+         if self.votes_to_upload:
+             votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+             try:
+                 with open(votes_file, "rb") as f:
+                     API.upload_file(
+                         path_or_fileobj=f,
+                         path_in_repo="votes_data.jsonl",
+                         repo_id=self.repo_id,
+                         repo_type="dataset",
+                         commit_message="Updating votes_data.jsonl with new votes",
+                     )
+                 logger.info("Votes uploaded to votes repository")
+                 self.votes_to_upload.clear()
+             except Exception as e:
+                 logger.error(f"Failed to upload votes to repository: {e}")
+
+ def run_scheduler(vote_manager):
+     while True:
+         schedule.run_pending()
+         time.sleep(1)
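
For completeness, one plausible way to hook the new voting pieces into the Space process; this is a sketch under assumptions (the placeholder paths and repo id below are not part of this diff, and the real wiring lives in app.py with values from src.envs):

```python
from threading import Thread

import schedule

from src.voting.vote_system import VoteManager, run_scheduler

# Placeholder values for illustration only.
VOTES_PATH = "./votes"
EVAL_REQUESTS_PATH = "./eval-queue"
VOTES_REPO = "open-llm-leaderboard/votes"  # assumed dataset repo id

vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)

# Periodically push locally recorded votes back to the Hub...
schedule.every(15).minutes.do(vote_manager.upload_votes)

# ...and run the schedule loop in the background so the Gradio app stays responsive.
Thread(target=run_scheduler, args=(vote_manager,), daemon=True).start()
```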