update_leaderboard
- app.py +45 -181
- requirements.txt +2 -1
- src/about.py +16 -44
- src/display/utils.py +1 -26
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +20 -8
- src/populate.py +13 -5
app.py
CHANGED
@@ -1,5 +1,5 @@
-import subprocess
 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
@@ -18,8 +18,6 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
     AutoEvalColumn,
     ModelType,
     fields,
@@ -34,6 +32,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -50,8 +49,7 @@ except Exception:
     restart_space()
 
 
-
-leaderboard_df = original_df.copy()
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -59,77 +57,36 @@ leaderboard_df = original_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-    return filtered_df
-
-
-def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-    final_df = []
-    if query != "":
-        queries = [q.strip() for q in query.split(";")]
-        for _q in queries:
-            _q = _q.strip()
-            if _q != "":
-                temp_filtered_df = search_table(filtered_df, _q)
-                if len(temp_filtered_df) > 0:
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-            )
-
-    return filtered_df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    if show_deleted:
-        filtered_df = df
-    else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0,
+                max=2000,
+                label="Select the number of parameters (M)",
+            ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
+        ],
+        # bool_checkboxgroup_label="Hide models",
+        # interactive=False,
+    )
 
 
 demo = gr.Blocks(css=custom_css)
@@ -138,111 +95,18 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("RGB Benchmark", elem_id="rgb-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("PGB Benchmark", elem_id="pgb-benchmark-tab-table", id=0):
+        #     leaderboard1 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("GUE Benchmark", elem_id="gue-benchmark-tab-table", id=0):
+        #     leaderboard2 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("GB Benchmark", elem_id="gb-benchmark-tab-table", id=0):
+        #     leaderboard3 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
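For readers unfamiliar with the new dependency, the snippet below is a minimal, self-contained sketch of the `gradio_leaderboard` component wired the same way as `init_leaderboard` above, but with a toy DataFrame and invented column names so it runs outside this Space. Only the `Leaderboard`, `SelectColumns`, and `ColumnFilter` arguments mirror the commit; everything else is illustrative.

```python
# Standalone sketch of the Leaderboard configuration used in init_leaderboard.
# Column names ("Model", "Score", ...) are made up for the example.
import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Score": [71.3, 68.9],
        "Params (M)": [35, 186],
        "Precision": ["float16", "bfloat16"],
    }
)

with gr.Blocks() as demo:
    Leaderboard(
        value=df,
        datatype=["markdown", "number", "number", "str"],
        select_columns=SelectColumns(
            default_selection=["Model", "Score"],
            cant_deselect=["Model"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
            ColumnFilter("Params (M)", type="slider", min=0, max=2000, label="Select the number of parameters (M)"),
        ],
    )

if __name__ == "__main__":
    demo.launch()
```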
requirements.txt
CHANGED
@@ -15,4 +15,5 @@ transformers==4.35.2
 tokenizers>=0.15.0
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+gradio_leaderboard
src/about.py
CHANGED
@@ -12,8 +12,12 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
+    task1 = Task("SNMD", "AUC", "SNMD (AUC)")
+    task2 = Task("SNMR", "F1", "SNMR (F1)")
+    task3 = Task("ArchiveII", "F1", "ArchiveII (F1)")
+    task4 = Task("bpRNA", "F1", "bpRNA (F1)")
+    task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +25,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">OmniGenomeBench Leaderboard</h1>"""
 
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-Intro text
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-```
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
+LLM_BENCHMARKS_TEXT = f"""
+## Why do we need this benchmark?
+Large-scale foundation models for molecular biology are a vital and rapidly developing part of the computational biology and AI4Science landscape. Because key parts of biology, such as DNA and RNA sequences and secondary structures, strongly affect one another, using this information within large-scale models allows foundation models to be adapted to multiple key tasks. However, this trend brings significant issues, chief among them the difficulty of evaluating these models comprehensively and comparing them fairly. Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only. This issue forces repeated benchmark testing, and models end up trained and adapted for a specific task with little real-world benefit. Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.

+## Evaluation Datasets
+TODO HERE

+## Reported Scores and Ranking
+TODO HERE

+## How it works
+Do we need this?

+## Reproducibility
+To reproduce our results, here are the commands you can run:
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## Some good practices before submitting a model
+
+### 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model :hugging_face:
+
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+@article{Yang2024,
+  author  = {Yang, Heng and Li, Ke},
+  title   = {Foundation Models Work},
+  journal = {arXiv},
+  year    = {2024},
+  note    = {arXiv preprint arXiv:XXXX.XXXXX},
+  url     = {https://arxiv.org/abs/XXXX.XXXXX}
+}
+"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes. Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes. However, GFMs face two significant challenges: the lack of benchmarking tools and of open-source software for diverse genomics. This hinders progress in various genomic tasks, such as RNA design and structure prediction.
+
+We address these challenges by introducing a dedicated benchmarking toolkit, GFM-Bench. It integrates millions of genomic sequences across hundreds of tasks from four large-scale benchmarks, ensuring robust evaluation of GFMs under the FAIR principles. GFM-Bench tackles issues of data insufficiency, metric reliability, transfer benchmarking, and reproducibility, all critical for identifying the limitations of GFMs.
+
+Additionally, we present open-source software designed to simplify and democratize the use of GFMs for various in-silico genomic tasks. This software offers easy-to-use interfaces, tutorials, and broad compatibility with GFMs and genomic tasks, promoting transparency and innovation in the field. It also includes a public leaderboard for existing GFMs to drive advancements in genome modeling.
+"""
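For readers who only see this diff: the `Task` container that the new `Tasks` entries instantiate is not part of the change. The sketch below reconstructs it from how its fields are consumed elsewhere in this commit (`task.benchmark`, `task.metric`, `task.value.col_name`); treat the field names as an assumption consistent with the stock leaderboard template rather than a verbatim copy of the file.

```python
# Sketch of the Task container, inferred from its call sites in this commit.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key of the task block in the results JSON
    metric: str     # key of the metric inside that block
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
    task1 = Task("SNMD", "AUC", "SNMD (AUC)")


# e.g. building the benchmark column headers, as src/display/utils.py does:
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
print(BENCHMARK_COLS)  # ['mRNA (RMSE)', 'SNMD (AUC)']
```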
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
@@ -91,10 +91,6 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -102,34 +98,13 @@ class Precision(Enum):
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        #if precision in ["8bit"]:
-        #    return Precision.qt_8bit
-        #if precision in ["4bit"]:
-        #    return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
-        #    return Precision.qt_GPTQ
         return Precision.Unknown
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
-NUMERIC_INTERVALS = {
-    "?": pd.Interval(-1, 0, closed="right"),
-    "~1.5": pd.Interval(0, 2, closed="right"),
-    "~3": pd.Interval(2, 4, closed="right"),
-    "~7": pd.Interval(4, 9, closed="right"),
-    "~13": pd.Interval(9, 20, closed="right"),
-    "~35": pd.Interval(20, 45, closed="right"),
-    "~60": pd.Interval(45, 70, closed="right"),
-    "70+": pd.Interval(70, 10000, closed="right"),
-}
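The repurposed "average" column is declared with the same `ColumnContent` helper as before; only its header changes to "Rank". `ColumnContent` itself is untouched by the commit and not shown here; the sketch below is a hypothetical reconstruction inferred from its call sites (positional name, type, displayed-by-default arguments plus the optional `hidden` and `never_hidden` flags), following the pattern of the stock leaderboard template.

```python
# Hypothetical reconstruction of ColumnContent and the AutoEvalColumn build step,
# inferred from how they are used in this file and in app.py.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                   # header shown in the table
    type: str                   # gradio datatype ("str", "markdown", "number", ...)
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# The (name, type, default) entries collected in auto_eval_column_dict are turned
# into a frozen dataclass, so columns are addressed as attributes elsewhere:
AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [("average", ColumnContent, ColumnContent("Rank", "number", True))],
    frozen=True,
)
print(AutoEvalColumn.average.name)  # "Rank"
```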
src/envs.py
CHANGED
@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "yangheng"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/OmniGenomeLeaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
src/leaderboard/read_evals.py
CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -93,8 +96,8 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average =
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
    request_files = glob.glob(request_files)
 
    # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
            if (
                req_content["status"] in ["FINISHED"]
                and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         eval_results[eval_name] = eval_result
 
     results = []
-    for
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i, v in enumerate(sorted_results):
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(i)  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
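Two behavioural changes are bundled in this file: per-task scores other than the mRNA RMSE are scaled to percentages before averaging, and `to_dict` now takes a rank derived from each model's position once results are sorted by that average. A small, self-contained sketch of that ranking scheme follows, with made-up model names and scores rather than data from the repo.

```python
# Sketch of the rank computation introduced here, using plain dicts in place of
# EvalResult objects. Scores are already percentage-scaled, as in the commit.
import numpy as np

eval_results = {
    "model-a": {"SNMD": 78.0, "SNMR": 64.0},
    "model-b": {"SNMD": 81.5, "SNMR": 70.2},
}

# 1) average each model's per-task scores
averages = {name: np.mean(list(scores.values())) for name, scores in eval_results.items()}

# 2) sort models by that average, best first
sorted_models = sorted(averages, key=averages.get, reverse=True)

# 3) the position in the sorted order becomes the value stored in the "Rank" column
ranks = {name: i + 1 for i, name in enumerate(sorted_models)}
print(ranks)  # {'model-b': 1, 'model-a': 2}
```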
src/populate.py
CHANGED
@@ -1,8 +1,9 @@
 import json
 import os
-
+import numpy as np
 import pandas as pd
 
+
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
@@ -11,15 +12,22 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-
+    for result in raw_data:
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
+    print(sorted_results)
+    # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
+    # rank = [rank+1 for rank, value in enumerate(average)]
+    all_data_json = [v.to_dict(i+1) for i, v in enumerate(raw_data)]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-
+    print(df)
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -55,4 +63,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
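`get_leaderboard_df` still ends with the same two clean-up steps: restrict and round the selected columns, then drop any model missing a benchmark score. A toy illustration of that tail end is below; the column names are invented for the example, and `has_no_nan_values` is assumed to behave like the `notna().all(axis=1)` mask shown in the comment.

```python
# Toy version of the column selection, rounding, and NaN filtering in get_leaderboard_df.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "SNMD (AUC)": [81.537, np.nan],
        "SNMR (F1)": [70.211, 64.02],
    }
)

cols = ["Model", "SNMD (AUC)", "SNMR (F1)"]
benchmark_cols = ["SNMD (AUC)", "SNMR (F1)"]

df = df[cols].round(decimals=2)
df = df[df[benchmark_cols].notna().all(axis=1)]  # assumed equivalent of has_no_nan_values
print(df)  # only model-a survives: model-b is missing a benchmark score
```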