Abhaykoul committed
Commit 85e64a4
1 Parent(s): 7b57662

Update functions.py

Files changed (1)
  1. functions.py +68 -102
functions.py CHANGED
@@ -16,128 +16,95 @@ finished_models = get_datas(data)
 df = pd.DataFrame(finished_models)
 
 desc = """
-This is an automated PR created with https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard
-
-The purpose of this PR is to add evaluation results from the Open Portuguese LLM Leaderboard to your model card.
-
-If you encounter any issues, please report them to https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard/discussions
+This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr
+The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
+If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions
 """
 
 def search(df, value):
-    result_df = df[df["Model Name"] == value]
+    result_df = df[df["Model"] == value]
     return result_df.iloc[0].to_dict() if not result_df.empty else None
 
 
 def get_details_url(repo):
-    #author, model = repo.split("/")
-    return f"https://huggingface.co/datasets/eduagarcia-temp/llm_pt_leaderboard_raw_results/tree/main/{repo}"
+    author, model = repo.split("/")
+    return f"https://huggingface.co/datasets/open-llm-leaderboard/details_{author}__{model}"
 
 
 def get_query_url(repo):
-    return f"https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard?query={repo}"
+    return f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query={repo}"
 
 
 def get_task_summary(results):
     return {
-        "ENEM":
-            {"dataset_type":"eduagarcia/enem_challenge",
-            "dataset_name":"ENEM Challenge (No Images)",
-            "metric_type":"acc",
-            "metric_value":results["ENEM"],
-            "dataset_config": None,
-            "dataset_split":"train",
-            "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 3},
-            "metric_name":"accuracy"
-            },
-        "BLUEX":
-            {"dataset_type":"eduagarcia-temp/BLUEX_without_images",
-            "dataset_name":"BLUEX (No Images)",
-            "metric_type":"acc",
-            "metric_value":results["BLUEX"],
-            "dataset_config": None,
-            "dataset_split":"train",
-            "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 3},
-            "metric_name":"accuracy"
-            },
-        "OAB Exams":
-            {"dataset_type":"eduagarcia/oab_exams",
-            "dataset_name":"OAB Exams",
-            "metric_type":"acc",
-            "metric_value":results["OAB Exams"],
-            "dataset_config": None,
-            "dataset_split":"train",
-            "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 3},
-            "metric_name":"accuracy"
-            },
-        "ASSIN2 RTE":
-            {"dataset_type":"assin2",
-            "dataset_name":"Assin2 RTE",
-            "metric_type":"f1_macro",
-            "metric_value":results["ASSIN2 RTE"],
-            "dataset_config": None,
-            "dataset_split":"test",
-            "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 15},
-            "metric_name":"f1-macro"
-            },
-        "ASSIN2 STS":
-            {"dataset_type":"assin2",
-            "dataset_name":"Assin2 STS",
-            "metric_type":"pearson",
-            "metric_value":results["ASSIN2 STS"],
-            "dataset_config": None,
+        "ARC":
+            {"dataset_type":"ai2_arc",
+            "dataset_name":"AI2 Reasoning Challenge (25-Shot)",
+            "metric_type":"acc_norm",
+            "metric_value":results["ARC"],
+            "dataset_config":"ARC-Challenge",
             "dataset_split":"test",
             "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 15},
-            "metric_name":"pearson"
+            "dataset_args":{"num_few_shot": 25},
+            "metric_name":"normalized accuracy"
             },
-        "FAQUAD NLI":
-            {"dataset_type":"ruanchaves/faquad-nli",
-            "dataset_name":"FaQuAD NLI",
-            "metric_type":"f1_macro",
-            "metric_value":results["FAQUAD NLI"],
-            "dataset_config": None,
-            "dataset_split":"test",
+        "HellaSwag":
+            {"dataset_type":"hellaswag",
+            "dataset_name":"HellaSwag (10-Shot)",
+            "metric_type":"acc_norm",
+            "metric_value":results["HellaSwag"],
+            "dataset_config":None,
+            "dataset_split":"validation",
             "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 15},
-            "metric_name":"f1-macro"
+            "dataset_args":{"num_few_shot": 10},
+            "metric_name":"normalized accuracy"
             },
-        "HateBR":
-            {"dataset_type":"eduagarcia/portuguese_benchmark",
-            "dataset_name":"HateBR Binary",
-            "metric_type":"f1_macro",
-            "metric_value":results["HateBR"],
-            "dataset_config": None,
+        "MMLU":
+            {
+            "dataset_type":"cais/mmlu",
+            "dataset_name":"MMLU (5-Shot)",
+            "metric_type":"acc",
+            "metric_value":results["MMLU"],
+            "dataset_config":"all",
             "dataset_split":"test",
             "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 25},
-            "metric_name":"f1-macro"
-            },
-        "PT Hate Speech":
-            {"dataset_type":"eduagarcia/portuguese_benchmark",
-            "dataset_name":"PT Hate Speech Binary",
-            "metric_type":"f1_macro",
-            "metric_value":results["PT Hate Speech"],
-            "dataset_config": None,
-            "dataset_split":"test",
+            "dataset_args":{"num_few_shot": 5},
+            "metric_name":"accuracy"
+            },
+        "TruthfulQA":
+            {
+            "dataset_type":"truthful_qa",
+            "dataset_name":"TruthfulQA (0-shot)",
+            "metric_type":"mc2",
+            "metric_value":results["TruthfulQA"],
+            "dataset_config":"multiple_choice",
+            "dataset_split":"validation",
             "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 25},
-            "metric_name":"f1-macro"
-            },
-        "tweetSentBR":
-            {"dataset_type":"eduagarcia-temp/tweetsentbr",
-            "dataset_name":"tweetSentBR",
-            "metric_type":"f1_macro",
-            "metric_value":results["tweetSentBR"],
-            "dataset_config": None,
+            "dataset_args":{"num_few_shot": 0},
+            "metric_name":None
+            },
+        "Winogrande":
+            {
+            "dataset_type":"winogrande",
+            "dataset_name":"Winogrande (5-shot)",
+            "metric_type":"acc",
+            "metric_value":results["Winogrande"],
+            "dataset_config":"winogrande_xl",
+            "dataset_split":"validation",
+            "dataset_args":{"num_few_shot": 5},
+            "metric_name":"accuracy"
+            },
+        "GSM8K":
+            {
+            "dataset_type":"gsm8k",
+            "dataset_name":"GSM8k (5-shot)",
+            "metric_type":"acc",
+            "metric_value":results["GSM8K"],
+            "dataset_config":"main",
             "dataset_split":"test",
-            "dataset_revision":None,
-            "dataset_args":{"num_few_shot": 25},
-            "metric_name":"f1-macro"
-            }
+            "dataset_args":{"num_few_shot": 5},
+            "metric_name":"accuracy"
+            }
     }
 
 
@@ -147,12 +114,11 @@ def get_eval_results(repo):
     task_summary = get_task_summary(results)
     md_writer = MarkdownTableWriter()
    md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
+    md_writer.value_matrix = [["Avg.", results['Average ⬆️']]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
     text = f"""
-# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
 Detailed results can be found [here]({get_details_url(repo)})
-
 {md_writer.dumps()}
 """
     return text
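
To make the updated flow concrete, here is a minimal usage sketch of the helpers touched by this commit. It is an illustration only: the repo id and every score in the stand-in DataFrame below are fabricated, and importing from functions assumes the Space's module-level data loading (get_datas(data)) succeeds in your environment.

import pandas as pd
from pytablewriter import MarkdownTableWriter

# Importing runs functions.py's module-level data loading; helpers are then reusable.
from functions import search, get_task_summary, get_details_url, get_query_url

# Stand-in for the leaderboard DataFrame; all values are hypothetical placeholders.
df = pd.DataFrame([{
    "Model": "some-author/some-model",
    "Average ⬆️": 65.4,
    "ARC": 61.2,
    "HellaSwag": 83.5,
    "MMLU": 62.1,
    "TruthfulQA": 47.8,
    "Winogrande": 78.0,
    "GSM8K": 40.0,
}])

repo = "some-author/some-model"
results = search(df, repo)                  # leaderboard row for the repo, as a dict
task_summary = get_task_summary(results)    # per-task metadata used in the model card

md_writer = MarkdownTableWriter()
md_writer.headers = ["Metric", "Value"]
md_writer.value_matrix = [["Avg.", results["Average ⬆️"]]] + [
    [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
]

print(get_details_url(repo))    # link to the per-model details dataset
print(get_query_url(repo))      # link that pre-fills the leaderboard search
print(md_writer.dumps())        # markdown table embedded in the PR description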