update space
- app.py +23 -21
- src/about.py +2 -2
- src/leaderboard/read_evals.py +6 -5
app.py
CHANGED
@@ -36,7 +36,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-
+# Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -54,6 +54,7 @@ except Exception:
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+# print("Before calling init_leaderboard:", LEADERBOARD_DF)
 
 (
     finished_eval_queue_df,
@@ -61,6 +62,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -74,20 +76,20 @@ def init_leaderboard(dataframe):
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -97,7 +99,7 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
@@ -106,16 +108,16 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
+            # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+            #     gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                    with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
-                        gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
+                    # with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
+                    #     gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
 
             with gr.Column():
                 with gr.Accordion(
@@ -221,4 +223,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(share=True)
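The last hunk is the standard APScheduler-plus-Gradio wiring: a background job asks the Hub to restart the Space every 30 minutes so fresh results are pulled in on startup, and the queued Blocks app is now launched with share=True. A minimal, self-contained sketch of that pattern follows; REPO_ID, the token, and the stub UI are placeholder assumptions, not this Space's real values.

```python
# Minimal sketch of the restart/launch wiring at the bottom of app.py.
# REPO_ID and the token are placeholders; the real app builds `demo` as the full leaderboard UI.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

API = HfApi(token="hf_...")   # assumption: the real token comes from the Space secrets
REPO_ID = "org/space-name"    # assumption: placeholder Space id

def restart_space():
    # Reboots the Space so the eval requests/results datasets are re-downloaded on startup
    API.restart_space(repo_id=REPO_ID)

with gr.Blocks() as demo:
    gr.Markdown("leaderboard placeholder")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # every 30 minutes
scheduler.start()

# queue() bounds concurrent event handlers; share=True additionally requests a temporary public link
demo.queue(default_concurrency_limit=40).launch(share=True)
```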
src/about.py
CHANGED
@@ -13,8 +13,8 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mmmlu", "acc", "MMMLU")
-    task1 = Task("mmlu", "acc", "MMLU")
-    task2 = Task("cmmlu", "acc", "CMMLU")
+    # task1 = Task("mmlu", "acc", "MMLU")
+    # task2 = Task("cmmlu", "acc", "CMMLU")
     task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")
     task4 = Task("mmmlu_bn", "acc", "MMMLU_BN")
     task5 = Task("mmmlu_de", "acc", "MMMLU_DE")
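Only the two commented-out entries change here; the remaining tasks keep the shape described in the comment (JSON task key, metric key, display name). For orientation, a hedged standalone sketch of that structure follows, with the Task field names assumed from the stock leaderboard template and only the enum entries visible in this hunk reproduced.

```python
# Hypothetical sketch of the Task/Tasks structure in src/about.py.
# Field names (benchmark, metric, col_name) are assumed from the common leaderboard template.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task_key in the results JSON
    metric: str     # metric_key in the results JSON
    col_name: str   # column name shown on the leaderboard

class Tasks(Enum):
    task0 = Task("mmmlu", "acc", "MMMLU")
    task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")
    task4 = Task("mmmlu_bn", "acc", "MMMLU_BN")
    task5 = Task("mmmlu_de", "acc", "MMMLU_DE")

# Dropping task1/task2 also shrinks len(Tasks), which the new average in
# read_evals.py divides by (see the next file).
print(len(Tasks), [t.value.col_name for t in Tasks])
```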
src/leaderboard/read_evals.py
CHANGED
@@ -96,7 +96,7 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model.split("/")[-1], self.precision.value.name)
-        # print("########",
+        # print("########",requests_path,self.full_model.split("/")[-1])
 
         try:
             with open(request_file, "r") as f:
@@ -112,9 +112,10 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
-        average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
-            keys_to_average)
+        # keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
+        # average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
+        #     keys_to_average)
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -182,6 +183,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        print(results_path)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -198,5 +200,4 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             results.append(v)
         except KeyError:  # not all eval values present
             continue
-
     return results
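The substantive change in to_dict is the averaging rule: the old code summed the three named keys (mmmlu, mmlu, cmmlu) and divided by three, whereas the new line sums every non-None score in self.results and divides by len(Tasks), so all configured benchmarks enter the average and a benchmark with no score effectively counts as zero. A small illustration with invented numbers:

```python
# Toy comparison of the old vs. new averaging in EvalResult.to_dict(); scores are invented.
results = {"mmmlu": 0.60, "mmlu": None, "cmmlu": 0.40, "mmmlu_ar": 0.50}
NUM_TASKS = 4  # stands in for len(Tasks)

# Old rule: only the three named benchmarks, divided by their fixed count of 3.
keys_to_average = ["mmmlu", "mmlu", "cmmlu"]
old_average = sum(
    results[k] for k in keys_to_average if results.get(k) is not None
) / len(keys_to_average)  # (0.60 + 0.40) / 3 = 0.333...

# New rule: every available score, divided by the total number of configured tasks.
new_average = sum(v for v in results.values() if v is not None) / NUM_TASKS
# (0.60 + 0.40 + 0.50) / 4 = 0.375

print(round(old_average, 3), round(new_average, 3))
```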