Spaces:
Paused
Paused
Clémentine
committed on
Commit
·
b323764
1
Parent(s):
217b585
Added icons for types + fixed pending queue
Browse files- app.py +9 -10
- src/assets/hardcoded_evals.py +3 -0
- src/assets/text_content.py +7 -0
- src/auto_leaderboard/load_results.py +5 -1
- src/auto_leaderboard/model_metadata_type.py +19 -16
- src/utils_display.py +2 -2
app.py
CHANGED
@@ -99,7 +99,6 @@ def get_leaderboard_df():
|
|
99 |
|
100 |
|
101 |
def get_evaluation_queue_df():
|
102 |
-
# todo @saylortwift: replace the repo by the one you created for the eval queue
|
103 |
if eval_queue:
|
104 |
print("Pulling changes for the evaluation queue.")
|
105 |
eval_queue.git_pull()
|
@@ -141,7 +140,7 @@ def get_evaluation_queue_df():
|
|
141 |
data["model"] = make_clickable_model(data["model"])
|
142 |
all_evals.append(data)
|
143 |
|
144 |
-
pending_list = [e for e in all_evals if e["status"]
|
145 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
146 |
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
|
147 |
df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
|
@@ -388,6 +387,14 @@ with demo:
|
|
388 |
private = gr.Checkbox(
|
389 |
False, label="Private", visible=not IS_PUBLIC
|
390 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
|
392 |
with gr.Column():
|
393 |
precision = gr.Dropdown(
|
@@ -398,14 +405,6 @@ with demo:
|
|
398 |
max_choices=1,
|
399 |
interactive=True,
|
400 |
)
|
401 |
-
model_type = gr.Dropdown(
|
402 |
-
choices=["pretrained", "fine-tuned", "with RL"],
|
403 |
-
label="Model type",
|
404 |
-
multiselect=False,
|
405 |
-
value="pretrained",
|
406 |
-
max_choices=1,
|
407 |
-
interactive=True,
|
408 |
-
)
|
409 |
weight_type = gr.Dropdown(
|
410 |
choices=["Original", "Delta", "Adapter"],
|
411 |
label="Weights type",
|
|
|
99 |
|
100 |
|
101 |
def get_evaluation_queue_df():
|
|
|
102 |
if eval_queue:
|
103 |
print("Pulling changes for the evaluation queue.")
|
104 |
eval_queue.git_pull()
|
|
|
140 |
data["model"] = make_clickable_model(data["model"])
|
141 |
all_evals.append(data)
|
142 |
|
143 |
+
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
144 |
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
|
145 |
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
|
146 |
df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
|
|
|
387 |
private = gr.Checkbox(
|
388 |
False, label="Private", visible=not IS_PUBLIC
|
389 |
)
|
390 |
+
model_type = gr.Dropdown(
|
391 |
+
choices=["pretrained", "fine-tuned", "with RL"],
|
392 |
+
label="Model type",
|
393 |
+
multiselect=False,
|
394 |
+
value="pretrained",
|
395 |
+
max_choices=1,
|
396 |
+
interactive=True,
|
397 |
+
)
|
398 |
|
399 |
with gr.Column():
|
400 |
precision = gr.Dropdown(
|
|
|
405 |
max_choices=1,
|
406 |
interactive=True,
|
407 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
weight_type = gr.Dropdown(
|
409 |
choices=["Original", "Delta", "Adapter"],
|
410 |
label="Weights type",
|
src/assets/hardcoded_evals.py
CHANGED
@@ -10,6 +10,7 @@ gpt4_values = {
|
|
10 |
AutoEvalColumn.mmlu.name: 86.4,
|
11 |
AutoEvalColumn.truthfulqa.name: 59.0,
|
12 |
AutoEvalColumn.dummy.name: "GPT-4",
|
|
|
13 |
}
|
14 |
|
15 |
gpt35_values = {
|
@@ -22,6 +23,7 @@ gpt35_values = {
|
|
22 |
AutoEvalColumn.mmlu.name: 70.0,
|
23 |
AutoEvalColumn.truthfulqa.name: 47.0,
|
24 |
AutoEvalColumn.dummy.name: "GPT-3.5",
|
|
|
25 |
}
|
26 |
|
27 |
baseline = {
|
@@ -34,5 +36,6 @@ baseline = {
|
|
34 |
AutoEvalColumn.mmlu.name: 25.0,
|
35 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
36 |
AutoEvalColumn.dummy.name: "baseline",
|
|
|
37 |
}
|
38 |
|
|
|
10 |
AutoEvalColumn.mmlu.name: 86.4,
|
11 |
AutoEvalColumn.truthfulqa.name: 59.0,
|
12 |
AutoEvalColumn.dummy.name: "GPT-4",
|
13 |
+
AutoEvalColumn.model_type.name: "",
|
14 |
}
|
15 |
|
16 |
gpt35_values = {
|
|
|
23 |
AutoEvalColumn.mmlu.name: 70.0,
|
24 |
AutoEvalColumn.truthfulqa.name: 47.0,
|
25 |
AutoEvalColumn.dummy.name: "GPT-3.5",
|
26 |
+
AutoEvalColumn.model_type.name: "",
|
27 |
}
|
28 |
|
29 |
baseline = {
|
|
|
36 |
AutoEvalColumn.mmlu.name: 25.0,
|
37 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
38 |
AutoEvalColumn.dummy.name: "baseline",
|
39 |
+
AutoEvalColumn.model_type.name: "",
|
40 |
}
|
41 |
|
src/assets/text_content.py
CHANGED
@@ -128,6 +128,13 @@ To get more information about quantization, see:
|
|
128 |
- 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
|
129 |
- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
# In case of model failure
|
132 |
If your model is displayed in the `FAILED` category, its execution stopped.
|
133 |
Make sure you have followed the above steps first.
|
|
|
128 |
- 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
|
129 |
- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
|
130 |
|
131 |
+
### Icons
|
132 |
+
🟢 means that the model is pretrained
|
133 |
+
🔶 that it is finetuned
|
134 |
+
🟦 that it was trained with RL.
|
135 |
+
If there is no icon, we have not uploaded the information on the model yet; feel free to open an issue with the model information!
|
136 |
+
|
137 |
+
|
138 |
# In case of model failure
|
139 |
If your model is displayed in the `FAILED` category, its execution stopped.
|
140 |
Make sure you have followed the above steps first.
|
src/auto_leaderboard/load_results.py
CHANGED
@@ -26,6 +26,8 @@ class EvalResult:
|
|
26 |
revision: str
|
27 |
results: dict
|
28 |
precision: str = "16bit"
|
|
|
|
|
29 |
|
30 |
def to_dict(self):
|
31 |
if self.org is not None:
|
@@ -35,7 +37,9 @@ class EvalResult:
|
|
35 |
data_dict = {}
|
36 |
|
37 |
data_dict["eval_name"] = self.eval_name # not a column, just a save name
|
|
|
38 |
data_dict[AutoEvalColumn.precision.name] = self.precision
|
|
|
39 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
40 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
41 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
@@ -92,7 +96,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
92 |
continue
|
93 |
mean_acc = round(np.mean(accs) * 100.0, 1)
|
94 |
eval_results.append(EvalResult(
|
95 |
-
result_key, org, model, model_sha, {benchmark: mean_acc}
|
96 |
))
|
97 |
|
98 |
return result_key, eval_results
|
|
|
26 |
revision: str
|
27 |
results: dict
|
28 |
precision: str = "16bit"
|
29 |
+
model_type: str = ""
|
30 |
+
weight_type: str = ""
|
31 |
|
32 |
def to_dict(self):
|
33 |
if self.org is not None:
|
|
|
37 |
data_dict = {}
|
38 |
|
39 |
data_dict["eval_name"] = self.eval_name # not a column, just a save name
|
40 |
+
data_dict["weight_type"] = self.weight_type # not a column, just a save name
|
41 |
data_dict[AutoEvalColumn.precision.name] = self.precision
|
42 |
+
data_dict[AutoEvalColumn.model_type.name] = self.model_type
|
43 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
44 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
45 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
|
|
96 |
continue
|
97 |
mean_acc = round(np.mean(accs) * 100.0, 1)
|
98 |
eval_results.append(EvalResult(
|
99 |
+
eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
|
100 |
))
|
101 |
|
102 |
return result_key, eval_results
|
src/auto_leaderboard/model_metadata_type.py
CHANGED
@@ -2,6 +2,8 @@ from dataclasses import dataclass
|
|
2 |
from enum import Enum
|
3 |
from typing import Dict, List
|
4 |
|
|
|
|
|
5 |
@dataclass
|
6 |
class ModelInfo:
|
7 |
name: str
|
@@ -167,23 +169,24 @@ TYPE_METADATA: Dict[str, ModelType] = {
|
|
167 |
|
168 |
def get_model_type(leaderboard_data: List[dict]):
|
169 |
for model_data in leaderboard_data:
|
170 |
-
#
|
171 |
-
model_data["
|
172 |
-
model_data["Type"] = ""
|
173 |
-
|
174 |
# Stored information
|
175 |
if model_data["model_name_for_query"] in TYPE_METADATA:
|
176 |
-
model_data[
|
177 |
-
model_data[
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
188 |
|
189 |
|
|
|
2 |
from enum import Enum
|
3 |
from typing import Dict, List
|
4 |
|
5 |
+
from ..utils_display import AutoEvalColumn
|
6 |
+
|
7 |
@dataclass
|
8 |
class ModelInfo:
|
9 |
name: str
|
|
|
169 |
|
170 |
def get_model_type(leaderboard_data: List[dict]):
|
171 |
for model_data in leaderboard_data:
|
172 |
+
# Todo @clefourrier once requests are connected with results
|
173 |
+
is_delta = False # (model_data["weight_type"] != "Original")
|
|
|
|
|
174 |
# Stored information
|
175 |
if model_data["model_name_for_query"] in TYPE_METADATA:
|
176 |
+
model_data[AutoEvalColumn.model_type.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
|
177 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol + ("🔺" if is_delta else "")
|
178 |
+
# Inferred from the name or the selected type
|
179 |
+
elif model_data[AutoEvalColumn.model_type.name] == "pretrained" or any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
|
180 |
+
model_data[AutoEvalColumn.model_type.name] = ModelType.PT.value.name
|
181 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.PT.value.symbol + ("🔺" if is_delta else "")
|
182 |
+
elif model_data[AutoEvalColumn.model_type.name] == "finetuned" or any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
|
183 |
+
model_data[AutoEvalColumn.model_type.name] = ModelType.SFT.value.name
|
184 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.SFT.value.symbol + ("🔺" if is_delta else "")
|
185 |
+
elif model_data[AutoEvalColumn.model_type.name] == "with RL" or any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
|
186 |
+
model_data[AutoEvalColumn.model_type.name] = ModelType.RL.value.name
|
187 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.RL.value.symbol + ("🔺" if is_delta else "")
|
188 |
+
else:
|
189 |
+
model_data[AutoEvalColumn.model_type.name] = "N/A"
|
190 |
+
model_data[AutoEvalColumn.model_type_symbol.name] = ("🔺" if is_delta else "")
|
191 |
|
192 |
|
src/utils_display.py
CHANGED
@@ -14,14 +14,14 @@ def fields(raw_class):
|
|
14 |
|
15 |
@dataclass(frozen=True)
|
16 |
class AutoEvalColumn: # Auto evals column
|
17 |
-
model_type_symbol = ColumnContent("
|
18 |
model = ColumnContent("Model", "markdown", True)
|
19 |
average = ColumnContent("Average ⬆️", "number", True)
|
20 |
arc = ColumnContent("ARC", "number", True)
|
21 |
hellaswag = ColumnContent("HellaSwag", "number", True)
|
22 |
mmlu = ColumnContent("MMLU", "number", True)
|
23 |
truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
|
24 |
-
model_type = ColumnContent("Type
|
25 |
precision = ColumnContent("Precision", "str", False, True)
|
26 |
license = ColumnContent("Hub License", "str", False)
|
27 |
params = ColumnContent("#Params (B)", "number", False)
|
|
|
14 |
|
15 |
@dataclass(frozen=True)
|
16 |
class AutoEvalColumn: # Auto evals column
|
17 |
+
model_type_symbol = ColumnContent("T", "str", True)
|
18 |
model = ColumnContent("Model", "markdown", True)
|
19 |
average = ColumnContent("Average ⬆️", "number", True)
|
20 |
arc = ColumnContent("ARC", "number", True)
|
21 |
hellaswag = ColumnContent("HellaSwag", "number", True)
|
22 |
mmlu = ColumnContent("MMLU", "number", True)
|
23 |
truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
|
24 |
+
model_type = ColumnContent("Type", "str", False)
|
25 |
precision = ColumnContent("Precision", "str", False, True)
|
26 |
license = ColumnContent("Hub License", "str", False)
|
27 |
params = ColumnContent("#Params (B)", "number", False)
|