alozowski (HF staff) committed
Commit 0a3530a
1 Parent(s): c81dadf

search-update (#662)


- New search logic (e82b8efe7266fd5b30bd9867371776024aa378f9)
- Conditional initialization based on SKIP_INIT (a18a1a4380052a8324f7493d66cea0e092230ac5)
- Returned enable_space_ci import (2785a0b0f691843aaa3da52516f7e6bf658ae4ad)
- Refactored app.py (64564977ed789023bdc471b607dbf4e4deaeccd2)
- Updated about.py (47b18e393de110048cc9b08c93cf93d45bc78a57)
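
For context on the new search logic: the reworked `filter_queries` in app.py (diff below) splits the search-bar string on `;`, treats terms prefixed with `license:` as Hub-license filters and all other terms as model-name filters, then de-duplicates results by model, precision and revision. A rough sketch of the parsing, not the exact code (the query value is made up):

    query = "mistral; license: mit"
    terms = [t.strip() for t in query.split(";") if t.strip()]
    # terms prefixed with "license:" filter on the Hub License column
    license_terms = [t.replace("license:", "").strip() for t in terms if t.startswith("license")]
    # everything else filters on the model name
    model_terms = [t for t in terms if not t.startswith("license")]
    # model_terms == ["mistral"], license_terms == ["mit"]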

.gitignore CHANGED
@@ -4,6 +4,7 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
+.DS_Store
 
 eval-queue/
 eval-results/
.python-version ADDED
@@ -0,0 +1 @@
+3.10.0
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -8,9 +9,9 @@ from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
+    FAQ_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
-    FAQ_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -23,23 +24,32 @@ from src.display.utils import (
     TYPES,
     AutoEvalColumn,
     ModelType,
-    fields,
+    Precision,
     WeightType,
-    Precision
+    fields,
+)
+from src.envs import (
+    API,
+    DYNAMIC_INFO_FILE_PATH,
+    DYNAMIC_INFO_PATH,
+    DYNAMIC_INFO_REPO,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    H4_TOKEN,
+    IS_PUBLIC,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
+from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
-from src.tools.plots import (
-    create_metric_plot_obj,
-    create_plot_df,
-    create_scores_df,
-)
+from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 # Start ephemeral Spaces on PRs (see config in README.md)
-#enable_space_ci()
+enable_space_ci()
+
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
@@ -50,32 +60,46 @@ def init_space(full_init: bool = True):
     try:
         print(EVAL_REQUESTS_PATH)
         snapshot_download(
-            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
+            repo_id=QUEUE_REPO,
+            local_dir=EVAL_REQUESTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            max_workers=8,
         )
     except Exception:
         restart_space()
     try:
         print(DYNAMIC_INFO_PATH)
         snapshot_download(
-            repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
+            repo_id=DYNAMIC_INFO_REPO,
+            local_dir=DYNAMIC_INFO_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            max_workers=8,
         )
     except Exception:
         restart_space()
     try:
         print(EVAL_RESULTS_PATH)
         snapshot_download(
-            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, max_workers=8
+            repo_id=RESULTS_REPO,
+            local_dir=EVAL_RESULTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            max_workers=8,
         )
     except Exception:
-        restart_space()
-
+        restart_space()
 
     raw_data, original_df = get_leaderboard_df(
-        results_path=EVAL_RESULTS_PATH,
-        requests_path=EVAL_REQUESTS_PATH,
-        dynamic_path=DYNAMIC_INFO_FILE_PATH,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS
+        results_path=EVAL_RESULTS_PATH,
+        requests_path=EVAL_REQUESTS_PATH,
+        dynamic_path=DYNAMIC_INFO_FILE_PATH,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
     )
     update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
@@ -90,7 +114,16 @@ def init_space(full_init: bool = True):
 
     return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
-leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+
+# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
+# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
+    init_space(full_init=do_full_init)
+)
 
 
 # Searching and filtering
@@ -103,7 +136,13 @@ def update_table(
     hide_models: list,
     query: str,
 ):
-    filtered_df = filter_models(df=hidden_df, type_query=type_query, size_query=size_query, precision_query=precision_query, hide_models=hide_models)
+    filtered_df = filter_models(
+        df=hidden_df,
+        type_query=type_query,
+        size_query=size_query,
+        precision_query=precision_query,
+        hide_models=hide_models,
+    )
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     return df
@@ -111,43 +150,82 @@ def update_table(
 
 def load_query(request: gr.Request):  # triggered only once at startup => read query parameter if it exists
     query = request.query_params.get("query") or ""
-    return query, query  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
+    return (
+        query,
+        query,
+    )  # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
 
 
-def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False, na=False))]
+
+
+def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
 
 
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
     dummy_col = [AutoEvalColumn.dummy.name]
-    #AutoEvalColumn.model_type_symbol.name,
-    #AutoEvalColumn.model.name,
+    # AutoEvalColumn.model_type_symbol.name,
+    # AutoEvalColumn.model.name,
     # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
-    ]
+    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
     return filtered_df
 
 
-def filter_queries(query: str, filtered_df: pd.DataFrame):
-    """Added by Abishek"""
-    final_df = []
-    if query != "":
-        queries = [q.strip() for q in query.split(";")]
-        for _q in queries:
-            _q = _q.strip()
-            if _q != "":
-                temp_filtered_df = search_table(filtered_df, _q)
-                if len(temp_filtered_df) > 0:
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-            )
-
-    return filtered_df
+def filter_queries(query: str, df: pd.DataFrame):
+    tmp_result_df = []
+
+    # Empty query return the same df
+    if query == "":
+        return df
+
+    # all_queries = [q.strip() for q in query.split(";")]
+    # license_queries = []
+    all_queries = [q.strip() for q in query.split(";") if q.strip() != ""]
+    model_queries = [q for q in all_queries if not q.startswith("licence")]
+    license_queries_raw = [q for q in all_queries if q.startswith("license")]
+    license_queries = [
+        q.replace("license:", "").strip() for q in license_queries_raw if q.replace("license:", "").strip() != ""
+    ]
+
+    # Handling model name search
+    for query in model_queries:
+        tmp_df = search_model(df, query)
+        if len(tmp_df) > 0:
+            tmp_result_df.append(tmp_df)
+
+    if not tmp_result_df and not license_queries:
+        # Nothing is found, no license_queries -> return empty df
+        return pd.DataFrame(columns=df.columns)
+
+    if tmp_result_df:
+        df = pd.concat(tmp_result_df)
+        df = df.drop_duplicates(
+            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+        )
+
+    if not license_queries:
+        return df
+
+    # Handling license search
+    tmp_result_df = []
+    for query in license_queries:
+        tmp_df = search_license(df, query)
+        if len(tmp_df) > 0:
+            tmp_result_df.append(tmp_df)
+
+    if not tmp_result_df:
+        # Nothing is found, return empty df
+        return pd.DataFrame(columns=df.columns)
+
+    df = pd.concat(tmp_result_df)
+    df = df.drop_duplicates(
+        subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+    )
+
+    return df
 
 
 def filter_models(
@@ -179,12 +257,13 @@ def filter_models(
 
     return filtered_df
 
+
 leaderboard_df = filter_models(
-    df=leaderboard_df,
-    type_query=[t.to_str(" : ") for t in ModelType],
-    size_query=list(NUMERIC_INTERVALS.keys()),
+    df=leaderboard_df,
+    type_query=[t.to_str(" : ") for t in ModelType],
+    size_query=list(NUMERIC_INTERVALS.keys()),
     precision_query=[i.value.name for i in Precision],
-    hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"],  # Deleted, merges, flagged, MoEs
+    hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"],  # Deleted, merges, flagged, MoEs
 )
 
 demo = gr.Blocks(css=custom_css)
@@ -198,7 +277,7 @@ with demo:
         with gr.Column():
             with gr.Row():
                 search_bar = gr.Textbox(
-                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                    placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
                     show_label=False,
                     elem_id="search-bar",
                 )
@@ -221,12 +300,12 @@ with demo:
                 with gr.Row():
                     hide_models = gr.CheckboxGroup(
                         label="Hide models",
-                        choices = ["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
+                        choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
                         value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
-                        interactive=True
+                        interactive=True,
                     )
                 with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
+                    # with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -260,7 +339,7 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                #column_widths=["2%", "33%"]
+                # column_widths=["2%", "33%"]
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
@@ -301,8 +380,14 @@ with demo:
             )
             # Check query parameter once at startup and update search bar + hidden component
             demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
-
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, hide_models]:
+
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_columns_precision,
+                filter_columns_size,
+                hide_models,
+            ]:
                 selector.change(
                     update_table,
                     [
@@ -326,14 +411,14 @@ with demo:
                         [AutoEvalColumn.average.name],
                         title="Average of Top Scores and Human Baseline Over Time (from last update)",
                     )
-                    gr.Plot(value=chart, min_width=500)
+                    gr.Plot(value=chart, min_width=500)
                 with gr.Column():
                     chart = create_metric_plot_obj(
                         plot_df,
                         BENCHMARK_COLS,
                         title="Top Scores and Human Baseline Over Time (from last update)",
                     )
-                    gr.Plot(value=chart, min_width=500)
+                    gr.Plot(value=chart, min_width=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -441,8 +526,8 @@ with demo:
         )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
-scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
+scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
+scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
 scheduler.start()
 
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
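
A usage note on the conditional initialization shown above: `init_space` is invoked at module level, so the `LEADERBOARD_FULL_INIT` flag has to be present in the environment before app.py is imported. A minimal sketch of a local run with the full initialization disabled (hypothetical invocation; it assumes the Space's dependencies are installed locally):

    import os
    os.environ["LEADERBOARD_FULL_INIT"] = "False"  # any value other than "True" disables the full init
    import app  # runs init_space(full_init=False) and then launches the Gradio demo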
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,9 +1,9 @@
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E501"] # line too long (black is taking care of this)
+lint.select = ["E", "F"]
+lint.ignore = ["E501"] # line too long (black is taking care of this)
 line-length = 119
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
 [tool.isort]
 profile = "black"
@@ -11,3 +11,36 @@ line_length = 119
 
 [tool.black]
 line-length = 119
+
+[tool.poetry]
+name = "open-llm-leaderboard"
+version = "0.1.0"
+description = ""
+authors = []
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "3.10.0"
+apscheduler = "3.10.1"
+black = "23.11.0"
+click = "8.1.3"
+datasets = "2.14.5"
+huggingface-hub = ">=0.18.0"
+matplotlib = "3.7.1"
+numpy = "1.24.2"
+pandas = "2.0.0"
+plotly = "5.14.1"
+python-dateutil = "2.8.2"
+requests = "2.28.2"
+sentencepiece = "^0.2.0"
+tqdm = "4.65.0"
+transformers = "4.39.0"
+tokenizers = ">=0.15.0"
+gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.1"}
+gradio = "4.9.0"
+isort = "^5.13.2"
+ruff = "^0.3.5"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
src/display/about.py CHANGED
@@ -12,7 +12,7 @@ icons = f"""
 - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
 - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
 """
-LLM_BENCHMARKS_TEXT = f"""
+LLM_BENCHMARKS_TEXT = """
 ## ABOUT
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
@@ -134,6 +134,15 @@ My model has been flagged improperly, what can I do?
 
 ---------------------------
 
+## HOW TO SEARCH FOR A MODEL
+Search for models in the leaderboard by:
+1. Name, e.g., *model_name*
+2. Multiple names, separated by `;`, e.g., *model_name1;model_name2*
+3. License, prefix with `license:`, e.g., *license: MIT*
+4. Combination of name and license, order is irrelevant, e.g., *model_name; license: cc-by-sa-4.0*
+
+---------------------------
+
 ## EDITING SUBMISSIONS
 I upgraded my model and want to re-submit, how can I do that?
 - *Please open an issue with the precise name of your model, and we'll remove your model from the leaderboard so you can resubmit. You can also resubmit directly with the new commit hash!*
src/display/formatting.py CHANGED
@@ -1,12 +1,8 @@
-import os
-from datetime import datetime, timezone
-
 from huggingface_hub import HfApi
-from huggingface_hub.hf_api import ModelInfo
-
 
 API = HfApi()
 
+
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
src/display/utils.py CHANGED
@@ -3,6 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -13,6 +14,7 @@ class Task:
     metric: str
     col_name: str
 
+
 class Tasks(Enum):
     arc = Task("arc:challenge", "acc_norm", "ARC")
     hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
@@ -21,6 +23,7 @@ class Tasks(Enum):
     winogrande = Task("winogrande", "acc", "Winogrande")
     gsm8k = Task("gsm8k", "acc", "GSM8K")
 
+
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
@@ -33,11 +36,12 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
@@ -50,7 +54,9 @@ auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
+auto_eval_column_dict.append(
+    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
+)
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
@@ -60,6 +66,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
@@ -112,10 +119,11 @@ human_baseline_row = {
     AutoEvalColumn.flagged.name: False,
 }
 
+
 @dataclass
 class ModelDetails:
     name: str
-    symbol: str = ""  # emoji, only for the model type
+    symbol: str = ""  # emoji, only for the model type
 
 
 class ModelType(Enum):
@@ -143,11 +151,13 @@ class ModelType(Enum):
         return ModelType.merges
     return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -168,8 +178,6 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
-
-
 
 
 # Column selection
src/envs.py CHANGED
@@ -15,7 +15,7 @@ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
src/leaderboard/filter_models.py CHANGED
@@ -29,7 +29,7 @@ FLAGGED_MODELS = {
     "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
+    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
@@ -43,7 +43,6 @@ FLAGGED_MODELS = {
     "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
     "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
     "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-    "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
     "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
     "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
     "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
@@ -101,19 +100,19 @@ FLAGGED_MODELS = {
     "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     "cookinai/OpenCM-14": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "jan-hq/supermario-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     # MoErges
-    "cloudyu/Yi-34Bx2-MoE-60B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_34Bx2_MoE_60B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "gagan3012/MetaModel_moe":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b-v0.2":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx2_MoE":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/Orca-SOLAR-4x10.7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/piccolo-8x7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx4_MOE_24B":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
-    "macadeliccc/polyglot-math-4x7b":"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
+    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/540",
     # Other - contamination mostly
     "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/566",
     "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/556",
@@ -124,16 +123,16 @@ FLAGGED_MODELS = {
 # Models which have been requested by orgs to not be submitted on the leaderboard
 DO_NOT_SUBMIT_MODELS = [
     "Voicelab/trurl-2-13b",  # trained on MMLU
-    "TigerResearch/tigerbot-70b-chat",  # per authors request
-    "TigerResearch/tigerbot-70b-chat-v2",  # per authors request
-    "TigerResearch/tigerbot-70b-chat-v4-4k",  # per authors request
+    "TigerResearch/tigerbot-70b-chat",  # per authors request
+    "TigerResearch/tigerbot-70b-chat-v2",  # per authors request
+    "TigerResearch/tigerbot-70b-chat-v4-4k",  # per authors request
 ]
 
 
 def flag_models(leaderboard_data: list[dict]):
     for model_data in leaderboard_data:
         # Merges and moes are flagged automatically
-        if model_data[AutoEvalColumn.flagged.name] == True:
+        if model_data[AutoEvalColumn.flagged.name]:
             flag_key = "merged"
         else:
             flag_key = model_data["model_name_for_query"]
@@ -144,9 +143,9 @@ def flag_models(leaderboard_data: list[dict]):
                 FLAGGED_MODELS[flag_key],
                 f"See discussion #{issue_num}",
             )
-            model_data[
-                AutoEvalColumn.model.name
-            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+            model_data[AutoEvalColumn.model.name] = (
+                f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
+            )
             model_data[AutoEvalColumn.flagged.name] = True
         else:
             model_data[AutoEvalColumn.flagged.name] = False
src/leaderboard/read_evals.py CHANGED
@@ -7,29 +7,27 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np
 
-from huggingface_hub import ModelCard
-
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 
 
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str  # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"  # From config file
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""  # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     flagged: bool = False
@@ -96,8 +94,8 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
         )
 
     def update_with_request_file(self, requests_path):
@@ -113,7 +111,7 @@ class EvalResult:
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
-        except Exception as e:
+        except Exception:
             self.status = "FAILED"
             print(f"Could not find request file for {self.org}/{self.model}")
 
@@ -123,7 +121,6 @@ class EvalResult:
         self.still_on_hub = file_dict["still_on_hub"]
         self.tags = file_dict.get("tags", [])
         self.flagged = any("flagged" in tag for tag in self.tags)
-
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -145,7 +142,7 @@ class EvalResult:
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
-            AutoEvalColumn.flagged.name: self.flagged
+            AutoEvalColumn.flagged.name: self.flagged,
         }
 
         for task in Tasks:
@@ -168,10 +165,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -207,7 +201,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
         # Hardcoding because of gating problem
-        if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
+        if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
             eval_result.still_on_hub = True
 
         # Store results of same eval together
@@ -221,7 +215,7 @@
     for v in eval_results.values():
         try:
             if v.status == "FINISHED":
-                v.to_dict()  # we test if the dict version is complete
+                v.to_dict()  # we test if the dict version is complete
                 results.append(v)
         except KeyError:  # not all eval values present
             continue
src/populate.py CHANGED
@@ -9,7 +9,9 @@ from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
src/scripts/create_request_file.py CHANGED
@@ -7,8 +7,8 @@ import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
 
-from src.submission.check_validity import get_model_size
 from src.display.utils import ModelType, WeightType
+from src.submission.check_validity import get_model_size
 
 EVAL_REQUESTS_PATH = "eval-queue"
 QUEUE_REPO = "open-llm-leaderboard/requests"
src/scripts/update_all_request_files.py CHANGED
@@ -1,37 +1,41 @@
-from huggingface_hub import ModelFilter, snapshot_download
-from huggingface_hub import ModelCard
-
 import json
 import os
 import time
 
-from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_tags
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, API, H4_TOKEN
+from huggingface_hub import snapshot_download
+
+from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
+from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
+
 
 def update_one_model(model_id, data, models_on_the_hub):
     # Model no longer on the hub at all
     if model_id not in models_on_the_hub:
-        data['still_on_hub'] = False
-        data['likes'] = 0
-        data['downloads'] = 0
-        data['created_at'] = ""
+        data["still_on_hub"] = False
+        data["likes"] = 0
+        data["downloads"] = 0
+        data["created_at"] = ""
         data["tags"] = []
         return data
 
     # Grabbing model parameters
     model_cfg = models_on_the_hub[model_id]
-    data['likes'] = model_cfg.likes
-    data['downloads'] = model_cfg.downloads
-    data['created_at'] = str(model_cfg.created_at)
-    data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
+    data["likes"] = model_cfg.likes
+    data["downloads"] = model_cfg.downloads
+    data["created_at"] = str(model_cfg.created_at)
+    data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
 
     # Grabbing model details
     model_name = model_id
     if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
         if isinstance(model_cfg.card_data.base_model, str):
-            model_name = model_cfg.card_data.base_model  # for adapters, we look at the parent model
+            model_name = model_cfg.card_data.base_model  # for adapters, we look at the parent model
     still_on_hub, _, _ = is_model_on_hub(
-        model_name=model_name, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False, token=H4_TOKEN
+        model_name=model_name,
+        revision=data.get("revision"),
+        trust_remote_code=True,
+        test_tokenizer=False,
+        token=H4_TOKEN,
     )
     # If the model doesn't have a model card or a license, we consider it's deleted
     if still_on_hub:
@@ -42,13 +46,14 @@ def update_one_model(model_id, data, models_on_the_hub):
         except Exception:
             model_card = None
             still_on_hub = False
-    data['still_on_hub'] = still_on_hub
+    data["still_on_hub"] = still_on_hub
 
     tags = get_model_tags(model_card, model_id) if still_on_hub else []
 
     data["tags"] = tags
     return data
 
+
 def update_models(file_path, models_on_the_hub):
     """
     Search through all JSON files in the specified root folder and its subfolders,
@@ -60,9 +65,7 @@ def update_models(file_path, models_on_the_hub):
     for model_id in model_infos.keys():
         seen_models.append(model_id)
         model_infos[model_id] = update_one_model(
-            model_id = model_id,
-            data=model_infos[model_id],
-            models_on_the_hub=models_on_the_hub
+            model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
         )
 
     # If new requests files have been created since we started all this
@@ -70,7 +73,8 @@ def update_models(file_path, models_on_the_hub):
     all_models = []
     try:
         for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
-            if ix == 0: continue
+            if ix == 0:
+                continue
             for file in files:
                 if "eval_request" in file:
                     path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
@@ -81,18 +85,14 @@ def update_models(file_path, models_on_the_hub):
 
     for model_id in all_models:
         if model_id not in seen_models:
-            model_infos[model_id] = update_one_model(
-                model_id = model_id,
-                data={},
-                models_on_the_hub=models_on_the_hub
-            )
+            model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
 
-    with open(file_path, 'w') as f:
+    with open(file_path, "w") as f:
         json.dump(model_infos, f, indent=2)
 
+
 def update_dynamic_files():
-    """ This will only update metadata for models already linked in the repo, not add missing ones.
-    """
+    """This will only update metadata for models already linked in the repo, not add missing ones."""
     snapshot_download(
         repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
@@ -101,13 +101,15 @@ def update_dynamic_files():
     # Get models
     start = time.time()
 
-    models = list(API.list_models(
-        #filter=ModelFilter(task="text-generation"),
-        full=False,
-        cardData=True,
-        fetch_config=True,
-    ))
-    id_to_model = {model.id : model for model in models}
+    models = list(
+        API.list_models(
+            # filter=ModelFilter(task="text-generation"),
+            full=False,
+            cardData=True,
+            fetch_config=True,
+        )
+    )
+    id_to_model = {model.id: model for model in models}
 
     print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
 
@@ -122,7 +124,6 @@ def update_dynamic_files():
         path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
         repo_id=DYNAMIC_INFO_REPO,
         repo_type="dataset",
-        commit_message=f"Daily request file update.",
+        commit_message="Daily request file update.",
     )
-    print(f"UPDATE_DYNAMIC: pushed to hub")
-
+    print("UPDATE_DYNAMIC: pushed to hub")

src/submission/check_validity.py CHANGED
@@ -24,10 +24,14 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
24
  # Enforce license metadata
25
  if card.data.license is None:
26
  if not ("license_name" in card.data and "license_link" in card.data):
27
- return False, (
28
- "License not found. Please add a license to your model card using the `license` metadata or a"
29
- " `license_name`/`license_link` pair."
30
- ), None
 
 
 
 
31
 
32
  # Enforce card content
33
  if len(card.text) < 200:
@@ -36,27 +40,33 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
36
  return True, "", card
37
 
38
 
39
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
 
 
40
  try:
41
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
 
 
42
  if test_tokenizer:
43
  try:
44
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
 
45
  except ValueError as e:
 
 
46
  return (
47
  False,
48
- f"uses a tokenizer which is not in a transformers release: {e}",
49
- None
50
  )
51
- except Exception as e:
52
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
53
  return True, None, config
54
 
55
- except ValueError as e:
56
  return (
57
  False,
58
  "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
59
- None
60
  )
61
 
62
  except Exception as e:
@@ -64,6 +74,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
64
  return True, "uses a gated model.", None
65
  return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
66
 
 
67
  def get_model_size(model_info: ModelInfo, precision: str):
68
  size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
69
  safetensors = None
@@ -79,16 +90,18 @@ def get_model_size(model_info: ModelInfo, precision: str):
79
  size_match = re.search(size_pattern, model_info.id.lower())
80
  model_size = size_match.group(0)
81
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
82
- except AttributeError as e:
83
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
84
 
85
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
86
  model_size = size_factor * model_size
87
  return model_size
88
 
 
89
  def get_model_arch(model_info: ModelInfo):
90
  return model_info.config.get("architectures", "Unknown")
91
 
 
92
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
93
  if org_or_user not in users_to_submission_dates:
94
  return True, ""
@@ -135,6 +148,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
135
 
136
  return set(file_names), users_to_submission_dates
137
 
 
138
  def get_model_tags(model_card, model: str):
139
  is_merge_from_metadata = False
140
  is_moe_from_metadata = False
@@ -143,10 +157,14 @@ def get_model_tags(model_card, model: str):
143
  if model_card is None:
144
  return tags
145
  if model_card.data.tags:
146
- is_merge_from_metadata = any([tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]])
 
 
147
  is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
148
 
149
- is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"])
 
 
150
  if is_merge_from_model_card or is_merge_from_metadata:
151
  tags.append("merge")
152
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
 
24
  # Enforce license metadata
25
  if card.data.license is None:
26
  if not ("license_name" in card.data and "license_link" in card.data):
27
+ return (
28
+ False,
29
+ (
30
+ "License not found. Please add a license to your model card using the `license` metadata or a"
31
+ " `license_name`/`license_link` pair."
32
+ ),
33
+ None,
34
+ )
35
 
36
  # Enforce card content
37
  if len(card.text) < 200:
 
40
  return True, "", card
41
 
42
 
43
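The license and length rules enforced by check_model_card are easy to exercise on their own. A minimal sketch, assuming huggingface_hub's ModelCard.load; check_card and the shortened error strings are illustrative:

from huggingface_hub import ModelCard

def check_card(repo_id: str) -> tuple[bool, str]:
    try:
        card = ModelCard.load(repo_id)
    except Exception:
        return False, "has no readable model card"
    # Either a `license` field or a `license_name`/`license_link` pair must be present.
    if card.data.license is None and not ("license_name" in card.data and "license_link" in card.data):
        return False, "is missing license metadata"
    # Cards shorter than 200 characters are rejected as effectively empty.
    if len(card.text) < 200:
        return False, "has a model card that is too short"
    return True, ""
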
+ def is_model_on_hub(
44
+ model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
45
+ ) -> tuple[bool, str, AutoConfig]:
46
  try:
47
+ config = AutoConfig.from_pretrained(
48
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
+ ) # , force_download=True)
50
  if test_tokenizer:
51
  try:
52
+ tk = AutoTokenizer.from_pretrained(
53
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
+ )
55
  except ValueError as e:
56
+ return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
57
+ except Exception:
58
  return (
59
  False,
60
+ "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
61
+ None,
62
  )
 
 
63
  return True, None, config
64
 
65
+ except ValueError:
66
  return (
67
  False,
68
  "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
69
+ None,
70
  )
71
 
72
  except Exception as e:
 
74
  return True, "uses a gated model.", None
75
  return False, f"was not found or misconfigured on the hub! Error raised was {e.args[0]}", None
76
 
77
+
78
  def get_model_size(model_info: ModelInfo, precision: str):
79
  size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
80
  safetensors = None
 
90
  size_match = re.search(size_pattern, model_info.id.lower())
91
  model_size = size_match.group(0)
92
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
93
+ except AttributeError:
94
  return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
95
 
96
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.id.lower()) else 1
97
  model_size = size_factor * model_size
98
  return model_size
99
 
100
+
101
  def get_model_arch(model_info: ModelInfo):
102
  return model_info.config.get("architectures", "Unknown")
103
 
104
+
105
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
106
  if org_or_user not in users_to_submission_dates:
107
  return True, ""
 
148
 
149
  return set(file_names), users_to_submission_dates
150
 
151
+
152
  def get_model_tags(model_card, model: str):
153
  is_merge_from_metadata = False
154
  is_moe_from_metadata = False
 
157
  if model_card is None:
158
  return tags
159
  if model_card.data.tags:
160
+ is_merge_from_metadata = any(
161
+ [tag in model_card.data.tags for tag in ["merge", "moerge", "mergekit", "lazymergekit"]]
162
+ )
163
  is_moe_from_metadata = any([tag in model_card.data.tags for tag in ["moe", "moerge"]])
164
 
165
+ is_merge_from_model_card = any(
166
+ keyword in model_card.text.lower() for keyword in ["merged model", "merge model", "moerge"]
167
+ )
168
  if is_merge_from_model_card or is_merge_from_metadata:
169
  tags.append("merge")
170
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
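
When the safetensors metadata is unavailable, get_model_size falls back to parsing a parameter count out of the repo name with the size_pattern regex and applies the same GPTQ factor of 8. A condensed sketch of that fallback path; size_from_model_id is an illustrative name:

import re

SIZE_PATTERN = re.compile(r"(\d+\.)?\d+(b|m)")

def size_from_model_id(model_id: str, precision: str = "float16") -> float:
    # "b" suffixes are billions of parameters, "m" suffixes are converted to billions.
    match = re.search(SIZE_PATTERN, model_id.lower())
    if match is None:
        return 0  # unknown sizes are reported as 0
    raw = match.group(0)
    size = round(float(raw[:-1]) if raw[-1] == "b" else float(raw[:-1]) / 1e3, 3)
    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_id.lower()) else 1
    return size_factor * size

print(size_from_model_id("mistralai/Mistral-7B-v0.1"))          # 7.0
print(size_from_model_id("TheBloke/Llama-2-13B-GPTQ", "GPTQ"))  # 104.0
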
src/submission/submit.py CHANGED
@@ -2,23 +2,34 @@ import json
2
  import os
3
  from datetime import datetime, timezone
4
 
5
- from huggingface_hub import ModelCard, snapshot_download
6
 
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
- from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
9
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
10
  from src.submission.check_validity import (
11
  already_submitted_models,
12
  check_model_card,
13
  get_model_size,
 
14
  is_model_on_hub,
15
  user_submission_permission,
16
- get_model_tags
17
  )
18
 
19
  REQUESTED_MODELS = None
20
  USERS_TO_SUBMISSION_DATES = None
21
 
 
22
  def add_new_eval(
23
  model: str,
24
  base_model: str,
@@ -58,7 +69,9 @@ def add_new_eval(
58
  return styled_warning("Model authors have requested that their model not be submitted to the leaderboard.")
59
 
60
  if model == "CohereForAI/c4ai-command-r-plus":
61
- return styled_warning("This model cannot be submitted manually on the leaderboard before the transformers release.")
 
 
62
 
63
  # Does the model actually exist?
64
  if revision == "":
@@ -66,7 +79,9 @@ def add_new_eval(
66
 
67
  # Is the model on the hub?
68
  if weight_type in ["Delta", "Adapter"]:
69
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
 
 
70
  if not base_model_on_hub:
71
  return styled_error(f'Base model "{base_model}" {error}')
72
 
@@ -81,10 +96,8 @@ def add_new_eval(
81
  architectures = getattr(model_config, "architectures", None)
82
  if architectures:
83
  architecture = ";".join(architectures)
84
- downloads = getattr(model_config, 'downloads', 0)
85
- created_at = getattr(model_config, 'created_at', '')
86
-
87
-
88
 
89
  # Is the model info correctly filled?
90
  try:
@@ -103,7 +116,7 @@ def add_new_eval(
103
  modelcard_OK, error_msg, model_card = check_model_card(model)
104
  if not modelcard_OK:
105
  return styled_error(error_msg)
106
-
107
  tags = get_model_tags(model_card, model)
108
 
109
  # Seems good, creating the eval
@@ -130,8 +143,8 @@ def add_new_eval(
130
  "license": license,
131
  "still_on_hub": True,
132
  "tags": tags,
133
- "downloads": downloads,
134
- "created_at": created_at
135
  }
136
 
137
  # Check for duplicate submission
@@ -175,8 +188,6 @@ def add_new_eval(
175
  commit_message=f"Add {model} to dynamic info queue",
176
  )
177
 
178
-
179
-
180
  # Remove the local file
181
  os.remove(out_path)
182
 
 
2
  import os
3
  from datetime import datetime, timezone
4
 
5
+ from huggingface_hub import snapshot_download
6
 
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
+ from src.envs import (
9
+ API,
10
+ DYNAMIC_INFO_FILE_PATH,
11
+ DYNAMIC_INFO_PATH,
12
+ DYNAMIC_INFO_REPO,
13
+ EVAL_REQUESTS_PATH,
14
+ H4_TOKEN,
15
+ QUEUE_REPO,
16
+ RATE_LIMIT_PERIOD,
17
+ RATE_LIMIT_QUOTA,
18
+ )
19
  from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
20
  from src.submission.check_validity import (
21
  already_submitted_models,
22
  check_model_card,
23
  get_model_size,
24
+ get_model_tags,
25
  is_model_on_hub,
26
  user_submission_permission,
 
27
  )
28
 
29
  REQUESTED_MODELS = None
30
  USERS_TO_SUBMISSION_DATES = None
31
 
32
+
33
  def add_new_eval(
34
  model: str,
35
  base_model: str,
 
69
  return styled_warning("Model authors have requested that their model not be submitted to the leaderboard.")
70
 
71
  if model == "CohereForAI/c4ai-command-r-plus":
72
+ return styled_warning(
73
+ "This model cannot be submitted manually to the leaderboard before the transformers release."
74
+ )
75
 
76
  # Does the model actually exist?
77
  if revision == "":
 
79
 
80
  # Is the model on the hub?
81
  if weight_type in ["Delta", "Adapter"]:
82
+ base_model_on_hub, error, _ = is_model_on_hub(
83
+ model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
84
+ )
85
  if not base_model_on_hub:
86
  return styled_error(f'Base model "{base_model}" {error}')
87
 
 
96
  architectures = getattr(model_config, "architectures", None)
97
  if architectures:
98
  architecture = ";".join(architectures)
99
+ downloads = getattr(model_config, "downloads", 0)
100
+ created_at = getattr(model_config, "created_at", "")
 
 
101
 
102
  # Is the model info correctly filled?
103
  try:
 
116
  modelcard_OK, error_msg, model_card = check_model_card(model)
117
  if not modelcard_OK:
118
  return styled_error(error_msg)
119
+
120
  tags = get_model_tags(model_card, model)
121
 
122
  # Seems good, creating the eval
 
143
  "license": license,
144
  "still_on_hub": True,
145
  "tags": tags,
146
+ "downloads": downloads,
147
+ "created_at": created_at,
148
  }
149
 
150
  # Check for duplicate submission
 
188
  commit_message=f"Add {model} to dynamic info queue",
189
  )
190
 
 
 
191
  # Remove the local file
192
  os.remove(out_path)
193
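
For reference, the record that add_new_eval writes combines the submission parameters with the metadata gathered above. A hedged sketch of just that assembly step; save_request_file, the output directory, and the fields not visible in this diff (status, submitted_time) are illustrative, and the real entry carries additional fields:

import json
import os
from datetime import datetime, timezone

def save_request_file(model: str, revision: str, precision: str, weight_type: str,
                      license: str, downloads: int, created_at: str, tags: list[str],
                      out_dir: str = "eval-queue") -> str:
    eval_entry = {
        "model": model,
        "revision": revision,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",  # assumed initial queue status
        "submitted_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "license": license,
        "still_on_hub": True,
        "tags": tags,
        "downloads": downloads,
        "created_at": created_at,
    }
    os.makedirs(out_dir, exist_ok=True)
    out_path = f"{out_dir}/{model.replace('/', '_')}_eval_request.json"
    with open(out_path, "w") as f:
        json.dump(eval_entry, f, indent=2)
    return out_path  # add_new_eval uploads the file to the queue repo, then removes the local copy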
 
src/tools/collections.py CHANGED
@@ -1,5 +1,3 @@
1
- import os
2
-
3
  import pandas as pd
4
  from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
5
  from huggingface_hub.utils._errors import HfHubHTTPError
 
 
 
1
  import pandas as pd
2
  from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
3
  from huggingface_hub.utils._errors import HfHubHTTPError
src/tools/plots.py CHANGED
@@ -1,14 +1,14 @@
1
- import pandas as pd
2
  import numpy as np
 
3
  import plotly.express as px
4
  from plotly.graph_objs import Figure
5
 
 
 
6
  from src.leaderboard.filter_models import FLAGGED_MODELS
7
- from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
8
  from src.leaderboard.read_evals import EvalResult
9
 
10
 
11
-
12
  def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
13
  """
14
  Generates a DataFrame containing the maximum scores until each date.
@@ -18,7 +18,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
18
  """
19
  # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
  results_df = pd.DataFrame(raw_data)
21
- #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
  results_df.sort_values(by="date", inplace=True)
23
 
24
  # Step 2: Initialize the scores dictionary
@@ -31,8 +31,13 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
31
  column = task.col_name
32
  for _, row in results_df.iterrows():
33
  current_model = row["full_model"]
34
- # We ignore models that are flagged/no longer on the hub/not finished
35
- to_ignore = not row["still_on_hub"] or row["flagged"] or current_model in FLAGGED_MODELS or row["status"] != "FINISHED"
 
 
 
 
 
36
  if to_ignore:
37
  continue
38
 
@@ -54,7 +59,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
54
  return {k: pd.DataFrame(v) for k, v in scores.items()}
55
 
56
 
57
- def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
58
  """
59
  Transforms the scores DataFrame into a new format suitable for plotting.
60
 
@@ -79,9 +84,7 @@ def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
79
  return concat_df
80
 
81
 
82
- def create_metric_plot_obj(
83
- df: pd.DataFrame, metrics: list[str], title: str
84
- ) -> Figure:
85
  """
86
  Create a Plotly figure object with lines representing different metrics
87
  and horizontal dotted lines representing human baselines.
 
 
1
  import numpy as np
2
+ import pandas as pd
3
  import plotly.express as px
4
  from plotly.graph_objs import Figure
5
 
6
+ from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
7
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
  from src.leaderboard.filter_models import FLAGGED_MODELS
 
9
  from src.leaderboard.read_evals import EvalResult
10
 
11
 
 
12
  def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
13
  """
14
  Generates a DataFrame containing the maximum scores until each date.
 
18
  """
19
  # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
  results_df = pd.DataFrame(raw_data)
21
+ # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
  results_df.sort_values(by="date", inplace=True)
23
 
24
  # Step 2: Initialize the scores dictionary
 
31
  column = task.col_name
32
  for _, row in results_df.iterrows():
33
  current_model = row["full_model"]
34
+ # We ignore models that are flagged/no longer on the hub/not finished
35
+ to_ignore = (
36
+ not row["still_on_hub"]
37
+ or row["flagged"]
38
+ or current_model in FLAGGED_MODELS
39
+ or row["status"] != "FINISHED"
40
+ )
41
  if to_ignore:
42
  continue
43
 
 
59
  return {k: pd.DataFrame(v) for k, v in scores.items()}
60
 
61
 
62
+ def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
63
  """
64
  Transforms the scores DataFrame into a new format suitable for plotting.
65
 
 
84
  return concat_df
85
 
86
 
87
+ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
 
 
88
  """
89
  Create a Plotly figure object with lines representing different metrics
90
  and horizontal dotted lines representing human baselines.
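
The to_ignore filter above feeds a running-maximum pass over results sorted by date. A toy illustration with made-up rows and an illustrative "Average" score column:

import pandas as pd

rows = pd.DataFrame(
    {
        "date": ["2023-07-01", "2023-08-01", "2023-09-01"],
        "full_model": ["m1", "m2", "m3"],
        "Average": [55.0, 58.0, 61.0],
        "still_on_hub": [True, True, True],
        "flagged": [False, True, False],
        "status": ["FINISHED"] * 3,
    }
).sort_values("date")

best, history = 0.0, []
for _, row in rows.iterrows():
    # Same conditions as to_ignore, minus the FLAGGED_MODELS lookup.
    if not row["still_on_hub"] or row["flagged"] or row["status"] != "FINISHED":
        continue
    if row["Average"] > best:
        best = row["Average"]
        history.append({"date": row["date"], "model": row["full_model"], "score": best})

print(pd.DataFrame(history))  # m1 at 55.0, then m3 at 61.0; the flagged m2 never enters
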
update_dynamic.py CHANGED
@@ -1,4 +1,4 @@
1
  from src.scripts.update_all_request_files import update_dynamic_files
2
 
3
  if __name__ == "__main__":
4
- update_dynamic_files()
 
1
  from src.scripts.update_all_request_files import update_dynamic_files
2
 
3
  if __name__ == "__main__":
4
+ update_dynamic_files()