update_leaderboard
- app.py +45 -181
- requirements.txt +2 -1
- src/about.py +16 -44
- src/display/utils.py +1 -26
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +20 -8
- src/populate.py +13 -5
app.py
CHANGED
@@ -1,5 +1,5 @@
-import subprocess
 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
@@ -18,8 +18,6 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
     AutoEvalColumn,
     ModelType,
     fields,
@@ -34,6 +32,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -50,8 +49,7 @@ except Exception:
     restart_space()
 
 
-
-leaderboard_df = original_df.copy()
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -59,77 +57,36 @@ leaderboard_df = original_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-    return filtered_df
-
-
-def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-    final_df = []
-    if query != "":
-        queries = [q.strip() for q in query.split(";")]
-        for _q in queries:
-            _q = _q.strip()
-            if _q != "":
-                temp_filtered_df = search_table(filtered_df, _q)
-                if len(temp_filtered_df) > 0:
-                    final_df.append(temp_filtered_df)
-        if len(final_df) > 0:
-            filtered_df = pd.concat(final_df)
-            filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-            )
-
-    return filtered_df
-
-
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    if show_deleted:
-        filtered_df = df
-    else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0,
+                max=2000,
+                label="Select the number of parameters (M)",
+            ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
+        ],
+        # bool_checkboxgroup_label="Hide models",
+        # interactive=False,
+    )
 
 
 demo = gr.Blocks(css=custom_css)
@@ -138,111 +95,18 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
-                with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
-                        search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("RGB Benchmark", elem_id="rgb-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("PGB Benchmark", elem_id="pgb-benchmark-tab-table", id=0):
+        #     leaderboard1 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("GUE Benchmark", elem_id="gue-benchmark-tab-table", id=0):
+        #     leaderboard2 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("GB Benchmark", elem_id="gb-benchmark-tab-table", id=0):
+        #     leaderboard3 = init_leaderboard(LEADERBOARD_DF)
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
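For readers unfamiliar with the new dependency, the snippet below is a minimal, self-contained sketch of the `gradio_leaderboard` component wired the same way as `init_leaderboard` above, but with a toy DataFrame and invented column names so it runs outside this Space. Only the `Leaderboard`, `SelectColumns`, and `ColumnFilter` arguments mirror the commit; everything else is illustrative.

```python
# Standalone sketch of the Leaderboard configuration used in init_leaderboard.
# Column names ("Model", "Score", ...) are made up for the example.
import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Score": [71.3, 68.9],
        "Params (M)": [35, 186],
        "Precision": ["float16", "bfloat16"],
    }
)

with gr.Blocks() as demo:
    Leaderboard(
        value=df,
        datatype=["markdown", "number", "number", "str"],
        select_columns=SelectColumns(
            default_selection=["Model", "Score"],
            cant_deselect=["Model"],
            label="Select Columns to Display:",
        ),
        search_columns=["Model"],
        filter_columns=[
            ColumnFilter("Precision", type="checkboxgroup", label="Precision"),
            ColumnFilter("Params (M)", type="slider", min=0, max=2000, label="Select the number of parameters (M)"),
        ],
    )

if __name__ == "__main__":
    demo.launch()
```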
requirements.txt
CHANGED
@@ -15,4 +15,5 @@ transformers==4.35.2
 tokenizers>=0.15.0
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+gradio_leaderboard
src/about.py
CHANGED
@@ -12,8 +12,12 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
+    task1 = Task("SNMD", "AUC", "SNMD (AUC)")
+    task2 = Task("SNMR", "F1", "SNMR (F1)")
+    task3 = Task("ArchiveII", "F1", "ArchiveII (F1)")
+    task4 = Task("bpRNA", "F1", "bpRNA (F1)")
+    task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +25,20 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">OmniGenomeBench Leaderboard</h1>"""
 
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-Intro text
-"""
-
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-```
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
+LLM_BENCHMARKS_TEXT = f"""
+## Why do we need this benchmark?
+Large-scale foundation models for molecular biology are a vital and rapidly developing part of the computational biology and AI4Science landscape. Because key parts of biology, such as DNA and RNA sequences and secondary structures, strongly affect one another, using this information within large-scale models allows foundation models to be adapted to multiple key tasks. However, this trend brings significant issues, chief among them the difficulty of evaluating these models comprehensively and comparing them fairly. Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only. This issue forces repeated benchmark testing, and models end up trained and adapted for a specific task with little real-world benefit. Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.

+## Evaluation Datasets
+TODO HERE

+## Reported Scores and Ranking
+TODO HERE

+## How it works
+Do we need this?

+## Reproducibility
+To reproduce our results, here are the commands you can run:
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## Some good practices before submitting a model
+
+### 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model :hugging_face:
+
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+@article{Yang2024,
+  author  = {Yang, Heng and Li, Ke},
+  title   = {Foundation Models Work},
+  journal = {arXiv},
+  year    = {2024},
+  note    = {arXiv preprint arXiv:XXXX.XXXXX},
+  url     = {https://arxiv.org/abs/XXXX.XXXXX}
+}
+"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes. Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes. However, GFMs face two significant challenges: the lack of benchmarking tools and of open-source software for diverse genomics. This hinders progress in various genomic tasks, such as RNA design and structure prediction.
+
+We address these challenges by introducing a dedicated benchmarking toolkit, GFM-Bench. It integrates millions of genomic sequences across hundreds of tasks from four large-scale benchmarks, ensuring robust evaluation of GFMs under the FAIR principles. GFM-Bench tackles issues of data insufficiency, metric reliability, transfer benchmarking, and reproducibility, all critical for identifying the limitations of GFMs.
+
+Additionally, we present open-source software designed to simplify and democratize the use of GFMs for various in-silico genomic tasks. This software offers easy-to-use interfaces, tutorials, and broad compatibility with GFMs and genomic tasks, promoting transparency and innovation in the field. It also includes a public leaderboard for existing GFMs to drive advancements in genome modeling.
+"""
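For readers who only see this diff: the `Task` container that the new `Tasks` entries instantiate is not part of the change. The sketch below reconstructs it from how its fields are consumed elsewhere in this commit (`task.benchmark`, `task.metric`, `task.value.col_name`); treat the field names as an assumption consistent with the stock leaderboard template rather than a verbatim copy of the file.

```python
# Sketch of the Task container, inferred from its call sites in this commit.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key of the task block in the results JSON
    metric: str     # key of the metric inside that block
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
    task1 = Task("SNMD", "AUC", "SNMD (AUC)")


# e.g. building the benchmark column headers, as src/display/utils.py does:
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
print(BENCHMARK_COLS)  # ['mRNA (RMSE)', 'SNMD (AUC)']
```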
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
@@ -91,10 +91,6 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -102,34 +98,13 @@ class Precision(Enum):
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        #if precision in ["8bit"]:
-        #    return Precision.qt_8bit
-        #if precision in ["4bit"]:
-        #    return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
-        #    return Precision.qt_GPTQ
         return Precision.Unknown
 
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
-NUMERIC_INTERVALS = {
-    "?": pd.Interval(-1, 0, closed="right"),
-    "~1.5": pd.Interval(0, 2, closed="right"),
-    "~3": pd.Interval(2, 4, closed="right"),
-    "~7": pd.Interval(4, 9, closed="right"),
-    "~13": pd.Interval(9, 20, closed="right"),
-    "~35": pd.Interval(20, 45, closed="right"),
-    "~60": pd.Interval(45, 70, closed="right"),
-    "70+": pd.Interval(70, 10000, closed="right"),
-}
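The repurposed "average" column is declared with the same `ColumnContent` helper as before; only its header changes to "Rank". `ColumnContent` itself is untouched by the commit and not shown here; the sketch below is a hypothetical reconstruction inferred from its call sites (positional name, type, displayed-by-default arguments plus the optional `hidden` and `never_hidden` flags), following the pattern of the stock leaderboard template.

```python
# Hypothetical reconstruction of ColumnContent and the AutoEvalColumn build step,
# inferred from how they are used in this file and in app.py.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                   # header shown in the table
    type: str                   # gradio datatype ("str", "markdown", "number", ...)
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# The (name, type, default) entries collected in auto_eval_column_dict are turned
# into a frozen dataclass, so columns are addressed as attributes elsewhere:
AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [("average", ColumnContent, ColumnContent("Rank", "number", True))],
    frozen=True,
)
print(AutoEvalColumn.average.name)  # "Rank"
```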
src/envs.py
CHANGED
@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "yangheng"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/OmniGenomeLeaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
src/leaderboard/read_evals.py
CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -93,8 +96,8 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average =
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
    request_files = glob.glob(request_files)
 
    # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
            if (
                req_content["status"] in ["FINISHED"]
                and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         eval_results[eval_name] = eval_result
 
     results = []
-    for
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i, v in enumerate(sorted_results):
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(i)  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
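Two behavioural changes are bundled in this file: per-task scores other than the mRNA RMSE are scaled to percentages before averaging, and `to_dict` now takes a rank derived from each model's position once results are sorted by that average. A small, self-contained sketch of that ranking scheme follows, with made-up model names and scores rather than data from the repo.

```python
# Sketch of the rank computation introduced here, using plain dicts in place of
# EvalResult objects. Scores are already percentage-scaled, as in the commit.
import numpy as np

eval_results = {
    "model-a": {"SNMD": 78.0, "SNMR": 64.0},
    "model-b": {"SNMD": 81.5, "SNMR": 70.2},
}

# 1) average each model's per-task scores
averages = {name: np.mean(list(scores.values())) for name, scores in eval_results.items()}

# 2) sort models by that average, best first
sorted_models = sorted(averages, key=averages.get, reverse=True)

# 3) the position in the sorted order becomes the value stored in the "Rank" column
ranks = {name: i + 1 for i, name in enumerate(sorted_models)}
print(ranks)  # {'model-b': 1, 'model-a': 2}
```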
src/populate.py
CHANGED
@@ -1,8 +1,9 @@
 import json
 import os
-
+import numpy as np
 import pandas as pd
 
+
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
@@ -11,15 +12,22 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-
+    for result in raw_data:
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
+    print(sorted_results)
+    # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
+    # rank = [rank+1 for rank, value in enumerate(average)]
+    all_data_json = [v.to_dict(i+1) for i, v in enumerate(raw_data)]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-
+    print(df)
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -55,4 +63,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
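`get_leaderboard_df` still ends with the same two clean-up steps: restrict and round the selected columns, then drop any model missing a benchmark score. A toy illustration of that tail end is below; the column names are invented for the example, and `has_no_nan_values` is assumed to behave like the `notna().all(axis=1)` mask shown in the comment.

```python
# Toy version of the column selection, rounding, and NaN filtering in get_leaderboard_df.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "SNMD (AUC)": [81.537, np.nan],
        "SNMR (F1)": [70.211, 64.02],
    }
)

cols = ["Model", "SNMD (AUC)", "SNMR (F1)"]
benchmark_cols = ["SNMD (AUC)", "SNMR (F1)"]

df = df[cols].round(decimals=2)
df = df[df[benchmark_cols].notna().all(axis=1)]  # assumed equivalent of has_no_nan_values
print(df)  # only model-a survives: model-b is missing a benchmark score
```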