round results

functions.py  +7 -7  CHANGED
@@ -40,7 +40,7 @@ def get_task_summary(results):
         {"dataset_type":"HuggingFaceH4/ifeval",
          "dataset_name":"IFEval (0-Shot)",
          "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-         "metric_value":results["IFEval"],
+         "metric_value": round(results["IFEval"], 2),
          "dataset_config": None, # don't know
          "dataset_split": None, # don't know
          "dataset_revision":None,
@@ -51,7 +51,7 @@ def get_task_summary(results):
         {"dataset_type":"BBH",
          "dataset_name":"BBH (3-Shot)",
          "metric_type":"acc_norm",
-         "metric_value":results["BBH"],
+         "metric_value": round(results["BBH"], 2),
          "dataset_config": None, # don't know
          "dataset_split": None, # don't know
          "dataset_revision":None,
@@ -63,7 +63,7 @@ def get_task_summary(results):
          "dataset_type":"hendrycks/competition_math",
          "dataset_name":"MATH Lvl 5 (4-Shot)",
          "metric_type":"exact_match",
-         "metric_value":results["MATH Lvl 5"],
+         "metric_value": round(results["MATH Lvl 5"], 2),
          "dataset_config": None, # don't know
          "dataset_split": None, # don't know
          "dataset_revision":None,
@@ -75,7 +75,7 @@ def get_task_summary(results):
          "dataset_type":"Idavidrein/gpqa",
          "dataset_name":"GPQA (0-shot)",
          "metric_type":"acc_norm",
-         "metric_value":results["GPQA"],
+         "metric_value": round(results["GPQA"], 2),
          "dataset_config": None, # don't know
          "dataset_split": None, # don't know
          "dataset_revision":None,
@@ -87,7 +87,7 @@ def get_task_summary(results):
          "dataset_type":"TAUR-Lab/MuSR",
          "dataset_name":"MuSR (0-shot)",
          "metric_type":"acc_norm",
-         "metric_value":results["MUSR"],
+         "metric_value": round(results["MUSR"], 2),
          "dataset_config": None, # don't know
          "dataset_split": None, # don't know
          "dataset_args":{"num_few_shot": 0},
@@ -98,7 +98,7 @@ def get_task_summary(results):
          "dataset_type":"TIGER-Lab/MMLU-Pro",
          "dataset_name":"MMLU-PRO (5-shot)",
          "metric_type":"acc",
-         "metric_value":results["MMLU-PRO"],
+         "metric_value": round(results["MMLU-PRO"], 2),
          "dataset_config":"main",
          "dataset_split":"test",
          "dataset_args":{"num_few_shot": 5},
@@ -113,7 +113,7 @@ def get_eval_results(repo):
     task_summary = get_task_summary(results)
     md_writer = MarkdownTableWriter()
     md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
+    md_writer.value_matrix = [["Avg.", round(results['Average ⬆️'], 2)]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]

     text = f"""
 # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
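Below is a minimal, self-contained sketch (not part of the commit) of what this change does: metric values are rounded to two decimal places before they are written into the task summary and the markdown table. It assumes `results` maps Open LLM Leaderboard task names to float scores and that `MarkdownTableWriter` comes from the pytablewriter package; the task keys and scores shown are hypothetical.

```python
# Minimal sketch of the rounding introduced in this commit (illustrative only).
# Assumptions: `results` holds float scores keyed by task name, and
# MarkdownTableWriter comes from the pytablewriter package.
from pytablewriter import MarkdownTableWriter

results = {  # hypothetical scores, for illustration
    "Average ⬆️": 23.456789,
    "IFEval": 45.678901,
    "BBH": 34.567812,
}

# Per-task summary entries now store rounded values, as in get_task_summary().
task_summary = {
    "ifeval": {"dataset_name": "IFEval (0-Shot)",
               "metric_value": round(results["IFEval"], 2)},
    "bbh": {"dataset_name": "BBH (3-Shot)",
            "metric_value": round(results["BBH"], 2)},
}

# The average is rounded the same way before the table rows are built.
md_writer = MarkdownTableWriter()
md_writer.headers = ["Metric", "Value"]
md_writer.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
    [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
]
print(md_writer.dumps())  # rows such as | IFEval (0-Shot) | 45.68 |
```

With the rounding in place, the rendered leaderboard table shows two-decimal values (e.g. 45.68) rather than the full float precision stored in the results.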