Weyaxi committed on
Commit
56c2f84
1 Parent(s): d6aff0a

round results

Files changed (1)
  1. functions.py +7 -7
functions.py CHANGED
@@ -40,7 +40,7 @@ def get_task_summary(results):
     {"dataset_type":"HuggingFaceH4/ifeval",
      "dataset_name":"IFEval (0-Shot)",
      "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-     "metric_value":results["IFEval"],
+     "metric_value": round(results["IFEval"], 2),
      "dataset_config": None, # don't know
      "dataset_split": None, # don't know
      "dataset_revision":None,
@@ -51,7 +51,7 @@ def get_task_summary(results):
     {"dataset_type":"BBH",
      "dataset_name":"BBH (3-Shot)",
      "metric_type":"acc_norm",
-     "metric_value":results["BBH"],
+     "metric_value": round(results["BBH"], 2),
      "dataset_config": None, # don't know
      "dataset_split": None, # don't know
      "dataset_revision":None,
@@ -63,7 +63,7 @@ def get_task_summary(results):
      "dataset_type":"hendrycks/competition_math",
      "dataset_name":"MATH Lvl 5 (4-Shot)",
      "metric_type":"exact_match",
-     "metric_value":results["MATH Lvl 5"],
+     "metric_value": round(results["MATH Lvl 5"], 2),
      "dataset_config": None, # don't know
      "dataset_split": None, # don't know
      "dataset_revision":None,
@@ -75,7 +75,7 @@ def get_task_summary(results):
      "dataset_type":"Idavidrein/gpqa",
      "dataset_name":"GPQA (0-shot)",
      "metric_type":"acc_norm",
-     "metric_value":results["GPQA"],
+     "metric_value": round(results["GPQA"], 2),
      "dataset_config": None, # don't know
      "dataset_split": None, # don't know
      "dataset_revision":None,
@@ -87,7 +87,7 @@ def get_task_summary(results):
      "dataset_type":"TAUR-Lab/MuSR",
      "dataset_name":"MuSR (0-shot)",
      "metric_type":"acc_norm",
-     "metric_value":results["MUSR"],
+     "metric_value": round(results["MUSR"], 2),
      "dataset_config": None, # don't know
      "dataset_split": None, # don't know
      "dataset_args":{"num_few_shot": 0},
@@ -98,7 +98,7 @@ def get_task_summary(results):
      "dataset_type":"TIGER-Lab/MMLU-Pro",
      "dataset_name":"MMLU-PRO (5-shot)",
      "metric_type":"acc",
-     "metric_value":results["MMLU-PRO"],
+     "metric_value": round(results["MMLU-PRO"], 2),
      "dataset_config":"main",
      "dataset_split":"test",
      "dataset_args":{"num_few_shot": 5},
@@ -113,7 +113,7 @@ def get_eval_results(repo):
     task_summary = get_task_summary(results)
     md_writer = MarkdownTableWriter()
     md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
+    md_writer.value_matrix = [["Avg.", round(results['Average ⬆️'], 2)]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
     text = f"""
 # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)