Clémentine committed
Commit 12cea14 · Parent: 99b25b8

FT: precision and adapter models
app.py
CHANGED
@@ -28,7 +28,6 @@ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-ADD_PLOTS = False
 
 EVAL_REQUESTS_PATH = "eval-queue"
 EVAL_RESULTS_PATH = "eval-results"
@@ -56,8 +55,8 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 if not IS_PUBLIC:
-    COLS.insert(2, AutoEvalColumn.is_8bit.name)
-    TYPES.insert(2, AutoEvalColumn.is_8bit.type)
+    COLS.insert(2, AutoEvalColumn.precision.name)
+    TYPES.insert(2, AutoEvalColumn.precision.type)
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -177,25 +176,27 @@ def add_new_eval(
     model: str,
     base_model: str,
     revision: str,
-    is_8bit: bool,
+    precision: str,
     private: bool,
-    is_delta_weight: bool,
+    weight_type: str,
 ):
+    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     # check the model actually exists before adding the eval
     if revision == "":
         revision = "main"
 
-    if is_delta_weight:
+    if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error = is_model_on_hub(base_model, revision)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
+
 
     model_on_hub, error = is_model_on_hub(model, revision)
     if not model_on_hub:
         return styled_error(f'Model "{model}" {error}')
-
+
     print("adding new eval")
 
     eval_entry = {
@@ -203,8 +204,8 @@ def add_new_eval(
         "base_model": base_model,
         "revision": revision,
         "private": private,
-        "8bit_eval": is_8bit,
-        "is_delta_weight": is_delta_weight,
+        "precision": precision,
+        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
     }
@@ -217,7 +218,7 @@ def add_new_eval(
 
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8bit}_{is_delta_weight}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
     # Check for duplicate submission
     if out_path.split("eval-queue/")[1].lower() in requested_models:
@@ -381,17 +382,29 @@ with demo:
                 revision_name_textbox = gr.Textbox(
                     label="revision", placeholder="main"
                 )
+                private = gr.Checkbox(
+                    False, label="Private", visible=not IS_PUBLIC
+                )
 
             with gr.Column():
-                is_8bit = gr.Checkbox(
-                    False, label="8 bit eval", visible=not IS_PUBLIC
+                precision = gr.Dropdown(
+                    choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    max_choices=1,
+                    interactive=True,
                 )
-                private = gr.Checkbox(
-                    False, label="Private", visible=not IS_PUBLIC
+                weight_type = gr.Dropdown(
+                    choices=["Original", "Delta", "Adapter"],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    max_choices=1,
+                    interactive=True,
                 )
-                is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(
-                    label="base model (for delta)"
+                    label="Base model (for delta or adapter weights)"
                 )
 
                 submit_button = gr.Button("Submit Eval")
@@ -402,9 +415,9 @@ with demo:
             model_name_textbox,
             base_model_name_textbox,
             revision_name_textbox,
-            is_8bit,
+            precision,
             private,
-            is_delta_weight,
+            weight_type,
         ],
         submission_result,
     )
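The `precision` dropdown values above carry a human-readable suffix (e.g. `8bit (LLM.int8)`), and `add_new_eval` keeps only the first token before writing the request file. A minimal sketch of that normalization and of the resulting filename; the inputs and `model_path` are hypothetical, and the real function also verifies the model exists on the Hub:

```python
# Hypothetical submission values mirroring the dropdown choices above.
precision = "4bit (QLoRA / FP4)"
weight_type = "Adapter"
private = False
model_path = "my-org_my-model"  # illustrative; the app derives this from the submitted model id

precision = precision.split(" ")[0]  # keep "4bit", drop the "(QLoRA / FP4)" annotation
out_path = f"eval-queue/my-org/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
print(out_path)
# eval-queue/my-org/my-org_my-model_eval_request_False_4bit_Adapter.json
```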
src/assets/hardcoded_evals.py
CHANGED
@@ -3,7 +3,7 @@ from src.utils_display import AutoEvalColumn, model_hyperlink
 gpt4_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 84.3,
     AutoEvalColumn.arc.name: 96.3,
     AutoEvalColumn.hellaswag.name: 95.3,
@@ -15,7 +15,7 @@ gpt4_values = {
 gpt35_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 71.9,
     AutoEvalColumn.arc.name: 85.2,
     AutoEvalColumn.hellaswag.name: 85.5,
@@ -27,7 +27,7 @@ gpt35_values = {
 baseline = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
-    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 25.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
src/assets/text_content.py
CHANGED
@@ -122,12 +122,16 @@ The tasks and few shots parameters are:
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
 
+### Quantization
+To get more information about quantization, see:
+- 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
+- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
+
 # In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
-
 """
 
 EVALUATION_QUEUE_TEXT = f"""
src/auto_leaderboard/get_model_metadata.py
CHANGED
@@ -36,7 +36,7 @@ def get_model_license(model_info):
 def get_model_likes(model_info):
     return model_info.likes
 
-size_pattern = re.compile(r"\d+(b|m)")
+size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
 
 def get_model_size(model_name, model_info):
     # In billions
@@ -46,7 +46,7 @@ def get_model_size(model_name, model_info):
     try:
         size_match = re.search(size_pattern, model_name.lower())
         size = size_match.group(0)
-        return round(int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3, 3)
+        return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
     except AttributeError:
         return None
 
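The widened `size_pattern` now accepts an optional fractional prefix, which is why the conversion switches from `int` to `float`. A standalone check of the behavior on hypothetical model names:

```python
import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

def size_in_billions(model_name: str):
    # Mirrors get_model_size: a "b" suffix means billions, "m" means millions.
    match = re.search(size_pattern, model_name.lower())
    if match is None:
        return None
    size = match.group(0)
    return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)

print(size_in_billions("my-model-7b"))    # 7.0
print(size_in_billions("my-model-2.7b"))  # 2.7 (the old pattern \d+(b|m) only matched "7b")
print(size_in_billions("my-model-125m"))  # 0.125
```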
src/auto_leaderboard/load_results.py
CHANGED
@@ -24,7 +24,7 @@ class EvalResult:
     model: str
     revision: str
     results: dict
-    is_8bit: bool = False
+    precision: str = "16bit"
 
     def to_dict(self):
         if self.org is not None:
@@ -34,7 +34,7 @@ class EvalResult:
         data_dict = {}
 
         data_dict["eval_name"] = self.eval_name  # not a column, just a save name
-        data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
+        data_dict[AutoEvalColumn.precision.name] = self.precision
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
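`EvalResult` now tracks the precision a run used, defaulting to `"16bit"` so result files written before this change still parse. A trimmed sketch of the dataclass behavior; the real class defines more fields (`eval_name`, `org`, task results, and so on) elsewhere in the file:

```python
from dataclasses import dataclass

@dataclass
class EvalResult:
    # Reduced to the fields visible in the diff; illustrative only.
    model: str
    revision: str
    results: dict
    precision: str = "16bit"  # legacy results carry no precision info, so fp16 is assumed

legacy = EvalResult(model="my-org/my-model", revision="main", results={"arc": 25.0})
print(legacy.precision)  # 16bit
```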
src/auto_leaderboard/model_metadata_type.py
CHANGED
@@ -161,3 +161,12 @@ TYPE_METADATA: Dict[str, ModelType] = {
 def get_model_type(leaderboard_data: List[dict]):
     for model_data in leaderboard_data:
         model_data["Type"] = TYPE_METADATA.get(model_data["model_name_for_query"], "N/A")
+        if model_data["Type"] == "N/A":
+            if any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
+                model_data["Type"] = ModelType.SFT
+            elif any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
+                model_data["Type"] = ModelType.PT
+            elif any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
+                model_data["Type"] = ModelType.RL
+
+
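The added branch falls back to naming conventions when a model is missing from `TYPE_METADATA`. A standalone sketch of the same substring heuristic, with plain strings standing in for the `ModelType` members:

```python
def guess_model_type(model_name: str) -> str:
    # Same substring checks as the fallback branch in get_model_type.
    if any(i in model_name for i in ["finetuned", "-ft-"]):
        return "SFT"  # supervised fine-tune
    elif "pretrained" in model_name:
        return "PT"   # pretrained
    elif any(i in model_name for i in ["-rl-", "-rlhf-"]):
        return "RL"   # RL-tuned
    return "N/A"

print(guess_model_type("my-org/llama-7b-ft-dolly"))  # SFT
print(guess_model_type("my-org/base-rlhf-chat"))     # RL
```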
src/utils_display.py
CHANGED
@@ -20,8 +20,8 @@ class AutoEvalColumn:  # Auto evals column
     hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
     mmlu = ColumnContent("MMLU ⬆️", "number", True)
     truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
-    model_type = ColumnContent("Type", "bool", False)
-    is_8bit = ColumnContent("8bit", "bool", False, True)
+    model_type = ColumnContent("Type", "str", False)
+    precision = ColumnContent("Precision", "str", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
@@ -42,8 +42,8 @@ class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    is_8bit = ColumnContent("8bit", "bool", True)
-    is_delta_weight = ColumnContent("is_delta_weight", "bool", True)
+    precision = ColumnContent("precision", "bool", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
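Judging by these call sites, `ColumnContent` takes `(name, type, displayed_by_default, hidden)` positionally; note that the new `weight_type` entry passes the string `"Original"` (truthy) in the `displayed_by_default` slot. A sketch under that assumed signature, not the file's actual definition:

```python
from dataclasses import dataclass

# Assumed shape of ColumnContent, inferred from the call sites above.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

precision = ColumnContent("Precision", "str", False, True)
print(precision.displayed_by_default, precision.hidden)  # False True
```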