yangheng commited on
Commit
1359887
·
1 Parent(s): f02c8f8

update_leaderboard

Browse files
Files changed (7) hide show
  1. app.py +45 -181
  2. requirements.txt +2 -1
  3. src/about.py +16 -44
  4. src/display/utils.py +1 -26
  5. src/envs.py +2 -2
  6. src/leaderboard/read_evals.py +20 -8
  7. src/populate.py +13 -5
app.py CHANGED
@@ -1,5 +1,5 @@
1
- import subprocess
2
  import gradio as gr
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
@@ -18,8 +18,6 @@ from src.display.utils import (
18
  COLS,
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
- NUMERIC_INTERVALS,
22
- TYPES,
23
  AutoEvalColumn,
24
  ModelType,
25
  fields,
@@ -34,6 +32,7 @@ from src.submission.submit import add_new_eval
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
 
37
  try:
38
  print(EVAL_REQUESTS_PATH)
39
  snapshot_download(
@@ -50,8 +49,7 @@ except Exception:
50
  restart_space()
51
 
52
 
53
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
- leaderboard_df = original_df.copy()
55
 
56
  (
57
  finished_eval_queue_df,
@@ -59,77 +57,36 @@ leaderboard_df = original_df.copy()
59
  pending_eval_queue_df,
60
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
 
62
-
63
- # Searching and filtering
64
- def update_table(
65
- hidden_df: pd.DataFrame,
66
- columns: list,
67
- type_query: list,
68
- precision_query: str,
69
- size_query: list,
70
- show_deleted: bool,
71
- query: str,
72
- ):
73
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
74
- filtered_df = filter_queries(query, filtered_df)
75
- df = select_columns(filtered_df, columns)
76
- return df
77
-
78
-
79
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
-
82
-
83
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
84
- always_here_cols = [
85
- AutoEvalColumn.model_type_symbol.name,
86
- AutoEvalColumn.model.name,
87
- ]
88
- # We use COLS to maintain sorting
89
- filtered_df = df[
90
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
- ]
92
- return filtered_df
93
-
94
-
95
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
96
- final_df = []
97
- if query != "":
98
- queries = [q.strip() for q in query.split(";")]
99
- for _q in queries:
100
- _q = _q.strip()
101
- if _q != "":
102
- temp_filtered_df = search_table(filtered_df, _q)
103
- if len(temp_filtered_df) > 0:
104
- final_df.append(temp_filtered_df)
105
- if len(final_df) > 0:
106
- filtered_df = pd.concat(final_df)
107
- filtered_df = filtered_df.drop_duplicates(
108
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
109
- )
110
-
111
- return filtered_df
112
-
113
-
114
- def filter_models(
115
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
116
- ) -> pd.DataFrame:
117
- # Show all models
118
- if show_deleted:
119
- filtered_df = df
120
- else: # Show only still on the hub models
121
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
122
-
123
- type_emoji = [t[0] for t in type_query]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
125
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
126
-
127
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
128
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
129
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
130
- filtered_df = filtered_df.loc[mask]
131
-
132
- return filtered_df
133
 
134
 
135
  demo = gr.Blocks(css=custom_css)
@@ -138,111 +95,18 @@ with demo:
138
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
139
 
140
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
141
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
142
- with gr.Row():
143
- with gr.Column():
144
- with gr.Row():
145
- search_bar = gr.Textbox(
146
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
147
- show_label=False,
148
- elem_id="search-bar",
149
- )
150
- with gr.Row():
151
- shown_columns = gr.CheckboxGroup(
152
- choices=[
153
- c.name
154
- for c in fields(AutoEvalColumn)
155
- if not c.hidden and not c.never_hidden
156
- ],
157
- value=[
158
- c.name
159
- for c in fields(AutoEvalColumn)
160
- if c.displayed_by_default and not c.hidden and not c.never_hidden
161
- ],
162
- label="Select columns to show",
163
- elem_id="column-select",
164
- interactive=True,
165
- )
166
- with gr.Row():
167
- deleted_models_visibility = gr.Checkbox(
168
- value=False, label="Show gated/private/deleted models", interactive=True
169
- )
170
- with gr.Column(min_width=320):
171
- #with gr.Box(elem_id="box-filter"):
172
- filter_columns_type = gr.CheckboxGroup(
173
- label="Model types",
174
- choices=[t.to_str() for t in ModelType],
175
- value=[t.to_str() for t in ModelType],
176
- interactive=True,
177
- elem_id="filter-columns-type",
178
- )
179
- filter_columns_precision = gr.CheckboxGroup(
180
- label="Precision",
181
- choices=[i.value.name for i in Precision],
182
- value=[i.value.name for i in Precision],
183
- interactive=True,
184
- elem_id="filter-columns-precision",
185
- )
186
- filter_columns_size = gr.CheckboxGroup(
187
- label="Model sizes (in billions of parameters)",
188
- choices=list(NUMERIC_INTERVALS.keys()),
189
- value=list(NUMERIC_INTERVALS.keys()),
190
- interactive=True,
191
- elem_id="filter-columns-size",
192
- )
193
-
194
- leaderboard_table = gr.components.Dataframe(
195
- value=leaderboard_df[
196
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
- + shown_columns.value
198
- ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
- datatype=TYPES,
201
- elem_id="leaderboard-table",
202
- interactive=False,
203
- visible=True,
204
- )
205
-
206
- # Dummy leaderboard for handling the case when the user uses backspace key
207
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
208
- value=original_df[COLS],
209
- headers=COLS,
210
- datatype=TYPES,
211
- visible=False,
212
- )
213
- search_bar.submit(
214
- update_table,
215
- [
216
- hidden_leaderboard_table_for_search,
217
- shown_columns,
218
- filter_columns_type,
219
- filter_columns_precision,
220
- filter_columns_size,
221
- deleted_models_visibility,
222
- search_bar,
223
- ],
224
- leaderboard_table,
225
- )
226
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
227
- selector.change(
228
- update_table,
229
- [
230
- hidden_leaderboard_table_for_search,
231
- shown_columns,
232
- filter_columns_type,
233
- filter_columns_precision,
234
- filter_columns_size,
235
- deleted_models_visibility,
236
- search_bar,
237
- ],
238
- leaderboard_table,
239
- queue=True,
240
- )
241
-
242
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
243
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
244
-
245
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
246
  with gr.Column():
247
  with gr.Row():
248
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
1
  import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
 
18
  COLS,
19
  EVAL_COLS,
20
  EVAL_TYPES,
 
 
21
  AutoEvalColumn,
22
  ModelType,
23
  fields,
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
+ ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
 
49
  restart_space()
50
 
51
 
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
53
 
54
  (
55
  finished_eval_queue_df,
 
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0,
80
+ max=2000,
81
+ label="Select the number of parameters (M)",
82
+ ),
83
+ # ColumnFilter(
84
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ # ),
86
+ ],
87
+ # bool_checkboxgroup_label="Hide models",
88
+ # interactive=False,
89
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  demo = gr.Blocks(css=custom_css)
 
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("RGB Benchmark", elem_id="rgb-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
+ # with gr.TabItem("PGB Benchmark", elem_id="pgb-benchmark-tab-table", id=0):
101
+ # leaderboard1 = init_leaderboard(LEADERBOARD_DF)
102
+ # with gr.TabItem("GUE Benchmark", elem_id="gue-benchmark-tab-table", id=0):
103
+ # leaderboard2 = init_leaderboard(LEADERBOARD_DF)
104
+ # with gr.TabItem("GB Benchmark", elem_id="gb-benchmark-tab-table", id=0):
105
+ # leaderboard3 = init_leaderboard(LEADERBOARD_DF)
106
+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
107
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
108
+
109
+ with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  with gr.Column():
111
  with gr.Row():
112
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
requirements.txt CHANGED
@@ -15,4 +15,5 @@ transformers==4.35.2
15
  tokenizers>=0.15.0
16
  git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
  accelerate==0.24.1
18
- sentencepiece
 
 
15
  tokenizers>=0.15.0
16
  git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
  accelerate==0.24.1
18
+ sentencepiece
19
+ gradio_leaderboard
src/about.py CHANGED
@@ -12,8 +12,12 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 
 
17
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
@@ -21,52 +25,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
- # What does your leaderboard evaluate?
27
- INTRODUCTION_TEXT = """
28
- Intro text
29
- """
30
-
31
- # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
-
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
-
38
- """
39
-
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
- CITATION_BUTTON_TEXT = r"""
72
- """
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
16
+ task1 = Task("SNMD", "AUC", "SNMD (AUC)")
17
+ task2 = Task("SNMR", "F1", "SNMR (F1)")
18
+ task3 = Task("ArchiveII", "F1", "ArchiveII (F1)")
19
+ task4 = Task("bpRNA", "F1", "bpRNA (F1)")
20
+ task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
21
 
22
  NUM_FEWSHOT = 0 # Change with your few shot
23
  # ---------------------------------------------------
 
25
 
26
 
27
  # Your leaderboard name
28
+ TITLE = """<h1 align="center" id="space-title">OmniGenomeBench Leaderboard</h1>"""
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ LLM_BENCHMARKS_TEXT = f"""## Why do we need this benchmark?Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.As key parts of biology, such as DNA, RNA sequences, secondary structures, have a large effect on each other, the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.However, with this trend comes significant issues, the primary one being the difficulty to comprehensively evaluate these models and compare them fairly.Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only.This issue forces repeated benchmark testing and models being trained and adapted for a specific task that may not have any real-world benefit.Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.## Evaluation DatasetsTODO HERE## Reported Scores and RankingTODO HERE## How it worksDo we need this?## ReproducibilityTo reproduce our results, here are the commands you can run:"""
32
+ EVALUATION_QUEUE_TEXT = """## Some good practices before submitting a model### 1) Make sure you can load your model and tokenizer using AutoClasses:```pythonfrom transformers import AutoConfig, AutoModel, AutoTokenizerconfig = AutoConfig.from_pretrained("your model name", revision=revision)model = AutoModel.from_pretrained("your model name", revision=revision)tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)```If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.Note: make sure your model is public!Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!### 3) Make sure your model has an open license!This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model :hugging_face:### 4) Fill up your model cardWhen we add extra information about models to the leaderboard, it will be automatically taken from the model card## In case of model failureIf your model is displayed in the `FAILED` category, its execution stopped.Make sure you have followed the above steps first.If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task)."""
33
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
34
+ CITATION_BUTTON_TEXT = r"""@article{Yang2024, author = {Yang, Heng and Li, Ke}, title = {Foundation Models Work}, journal = {arXiv}, year = {2024}, note = {arXiv preprint arXiv:XXXX.XXXXX} url = {https://arxiv.org/abs/XXXX.XXXXX}}"""
 
 
 
 
 
 
 
 
 
 
35
 
36
+ # What does your leaderboard evaluate?
37
+ INTRODUCTION_TEXT = """
38
+ The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes. Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes. However, GFMs face two significant challenges: the lack of benchmarking tools and open-source software for diverse genomics. This hinders progress in various genomic tasks, such as RNA design and structure prediction.
39
 
40
+ We address these challenges by introducing a dedicated benchmarking toolkit, GFM-Bench. It integrates millions of genomic sequences across hundreds of tasks from four large-scale benchmarks, ensuring robust evaluation of GFMs under the FAIR principles. GFM-Bench tackles issues of data insufficiency, metric reliability, transfer benchmarking, and reproducibility—critical for identifying the limitations of GFMs.
 
41
 
42
+ Additionally, we present an open-source software designed to simplify and democratize the use of GFMs for various in-silico genomic tasks. This software offers easy-to-use interfaces, tutorials, and broad compatibility with GFMs and genomic tasks, promoting transparency and innovation in the field. It also includes a public leaderboard for existing GFMs to drive advancements in genome modeling.
 
 
 
43
  """
44
 
 
 
 
src/display/utils.py CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
@@ -91,10 +91,6 @@ class WeightType(Enum):
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
94
- float32 = ModelDetails("float32")
95
- #qt_8bit = ModelDetails("8bit")
96
- #qt_4bit = ModelDetails("4bit")
97
- #qt_GPTQ = ModelDetails("GPTQ")
98
  Unknown = ModelDetails("?")
99
 
100
  def from_str(precision):
@@ -102,34 +98,13 @@ class Precision(Enum):
102
  return Precision.float16
103
  if precision in ["torch.bfloat16", "bfloat16"]:
104
  return Precision.bfloat16
105
- if precision in ["float32"]:
106
- return Precision.float32
107
- #if precision in ["8bit"]:
108
- # return Precision.qt_8bit
109
- #if precision in ["4bit"]:
110
- # return Precision.qt_4bit
111
- #if precision in ["GPTQ", "None"]:
112
- # return Precision.qt_GPTQ
113
  return Precision.Unknown
114
 
115
  # Column selection
116
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
117
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
118
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
119
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
120
 
121
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
122
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
123
 
124
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
125
 
126
- NUMERIC_INTERVALS = {
127
- "?": pd.Interval(-1, 0, closed="right"),
128
- "~1.5": pd.Interval(0, 2, closed="right"),
129
- "~3": pd.Interval(2, 4, closed="right"),
130
- "~7": pd.Interval(4, 9, closed="right"),
131
- "~13": pd.Interval(9, 20, closed="right"),
132
- "~35": pd.Interval(20, 45, closed="right"),
133
- "~60": pd.Interval(45, 70, closed="right"),
134
- "70+": pd.Interval(70, 10000, closed="right"),
135
- }
 
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
 
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
 
 
 
 
94
  Unknown = ModelDetails("?")
95
 
96
  def from_str(precision):
 
98
  return Precision.float16
99
  if precision in ["torch.bfloat16", "bfloat16"]:
100
  return Precision.bfloat16
 
 
 
 
 
 
 
 
101
  return Precision.Unknown
102
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
 
 
 
 
 
 
 
 
 
 
 
src/envs.py CHANGED
@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
- REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "yangheng" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
+ REPO_ID = f"{OWNER}/OmniGenomeLeaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
 
63
  architecture = "?"
64
  if model_config is not None:
65
  architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
70
  results = {}
71
  for task in Tasks:
72
  task = task.value
73
-
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
 
 
 
80
  results[task.benchmark] = mean_acc
81
 
82
  return self(
@@ -93,8 +96,8 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
 
96
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
107
  except Exception:
108
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
- def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
138
  requests_path,
139
  f"{model_name}_eval_request_*.json",
140
  )
 
141
  request_files = glob.glob(request_files)
142
 
143
  # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
 
 
149
  if (
150
  req_content["status"] in ["FINISHED"]
151
  and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
186
  eval_results[eval_name] = eval_result
187
 
188
  results = []
189
- for v in eval_results.values():
 
 
 
 
190
  try:
191
- v.to_dict() # we test if the dict version is complete
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
 
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
63
+ print("Is model on hub? \n", _)
64
  architecture = "?"
65
  if model_config is not None:
66
  architectures = getattr(model_config, "architectures", None)
 
71
  results = {}
72
  for task in Tasks:
73
  task = task.value
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
+ if task.benchmark == "mRNA":
79
+ # Keep RMSE at original value
80
+ mean_acc = np.mean(accs)
81
+ else:
82
+ mean_acc = np.mean(accs) * 100.0
83
  results[task.benchmark] = mean_acc
84
 
85
  return self(
 
96
 
97
  def update_with_request_file(self, requests_path):
98
  """Finds the relevant request file for the current model and updates info with it"""
99
+ # print("Requests Path: ", requests_path)
100
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
101
  try:
102
  with open(request_file, "r") as f:
103
  request = json.load(f)
 
110
  except Exception:
111
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
112
 
113
+ def to_dict(self, rank):
114
  """Converts the Eval Result to a dict compatible with our dataframe display"""
115
+ average = rank
116
+ # average = sorted(average, reverse=True)
117
+ # rank = [rank+1 for rank, value in enumerate(average)]
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
  AutoEvalColumn.precision.name: self.precision.value.name,
 
143
  requests_path,
144
  f"{model_name}_eval_request_*.json",
145
  )
146
+ # print("Request Files: ", request_files)
147
  request_files = glob.glob(request_files)
148
 
149
  # Select correct request file (precision)
 
152
  for tmp_request_file in request_files:
153
  with open(tmp_request_file, "r") as f:
154
  req_content = json.load(f)
155
+ # print("Request File: ", tmp_request_file)
156
+ # print("Req Content: ", req_content)
157
  if (
158
  req_content["status"] in ["FINISHED"]
159
  and req_content["precision"] == precision.split(".")[-1]
 
194
  eval_results[eval_name] = eval_result
195
 
196
  results = []
197
+ for result in eval_results.values():
198
+ result.average = np.mean(list(result.results.values()))
199
+ sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
200
+
201
+ for i,v in enumerate(sorted_results):
202
  try:
203
+ v.to_dict(i) # we test if the dict version is complete
204
  results.append(v)
205
  except KeyError: # not all eval values present
206
  continue
src/populate.py CHANGED
@@ -1,8 +1,9 @@
1
  import json
2
  import os
3
-
4
  import pandas as pd
5
 
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
@@ -11,15 +12,22 @@ from src.leaderboard.read_evals import get_raw_eval_results
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
 
 
 
 
 
 
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  df = df[has_no_nan_values(df, benchmark_cols)]
22
- return raw_data, df
 
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -55,4 +63,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
55
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
1
  import json
2
  import os
3
+ import numpy as np
4
  import pandas as pd
5
 
6
+
7
  from src.display.formatting import has_no_nan_values, make_clickable_model
8
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
9
  from src.leaderboard.read_evals import get_raw_eval_results
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
+ for result in raw_data:
16
+ result.average = np.mean(list(result.results.values()))
17
+ sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
18
+ print(sorted_results)
19
+ # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
20
+ # rank = [rank+1 for rank, value in enumerate(average)]
21
+ all_data_json = [v.to_dict(i+1) for i, v in enumerate(raw_data)]
22
 
23
  df = pd.DataFrame.from_records(all_data_json)
24
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
25
  df = df[cols].round(decimals=2)
26
 
27
  # filter out if any of the benchmarks have not been produced
28
  df = df[has_no_nan_values(df, benchmark_cols)]
29
+ print(df)
30
+ return df
31
 
32
 
33
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
63
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
64
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
65
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
66
+ return df_finished[cols], df_running[cols], df_pending[cols]