eduagarcia
committed on
Commit
·
e9177b9
1
Parent(s):
7bdbf7b
Fix bugs, fix datasets path, added test functions
Browse files
- functions.py +24 -19
- openllm.py +7 -1
functions.py
CHANGED
@@ -7,6 +7,7 @@ from pytablewriter import MarkdownTableWriter
|
|
7 |
import gradio as gr
|
8 |
from openllm import get_json_format_data, get_datas
|
9 |
import pandas as pd
|
|
|
10 |
|
11 |
BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
|
12 |
|
@@ -23,7 +24,7 @@ If you encounter any issues, please report them to https://huggingface.co/spaces
|
|
23 |
"""
|
24 |
|
25 |
def search(df, value):
|
26 |
-
result_df = df[df["Model"] == value]
|
27 |
return result_df.iloc[0].to_dict() if not result_df.empty else None
|
28 |
|
29 |
|
@@ -39,8 +40,8 @@ def get_query_url(repo):
|
|
39 |
def get_task_summary(results):
|
40 |
return {
|
41 |
"ENEM":
|
42 |
-
{"dataset_type":"enem_challenge",
|
43 |
-
"dataset_name":"ENEM Challenge",
|
44 |
"metric_type":"acc",
|
45 |
"metric_value":results["ENEM"],
|
46 |
"dataset_config": None,
|
@@ -50,8 +51,8 @@ def get_task_summary(results):
|
|
50 |
"metric_name":"accuracy"
|
51 |
},
|
52 |
"BLUEX":
|
53 |
-
{"dataset_type":"
|
54 |
-
"dataset_name":"BLUEX",
|
55 |
"metric_type":"acc",
|
56 |
"metric_value":results["BLUEX"],
|
57 |
"dataset_config": None,
|
@@ -61,7 +62,7 @@ def get_task_summary(results):
|
|
61 |
"metric_name":"accuracy"
|
62 |
},
|
63 |
"OAB Exams":
|
64 |
-
{"dataset_type":"oab_exams",
|
65 |
"dataset_name":"OAB Exams",
|
66 |
"metric_type":"acc",
|
67 |
"metric_value":results["OAB Exams"],
|
@@ -72,8 +73,8 @@ def get_task_summary(results):
|
|
72 |
"metric_name":"accuracy"
|
73 |
},
|
74 |
"ASSIN2 RTE":
|
75 |
-
{"dataset_type":"
|
76 |
-
"dataset_name":"
|
77 |
"metric_type":"f1_macro",
|
78 |
"metric_value":results["ASSIN2 RTE"],
|
79 |
"dataset_config": None,
|
@@ -83,8 +84,8 @@ def get_task_summary(results):
|
|
83 |
"metric_name":"f1-macro"
|
84 |
},
|
85 |
"ASSIN2 STS":
|
86 |
-
{"dataset_type":"
|
87 |
-
"dataset_name":"
|
88 |
"metric_type":"pearson",
|
89 |
"metric_value":results["ASSIN2 STS"],
|
90 |
"dataset_config": None,
|
@@ -94,8 +95,8 @@ def get_task_summary(results):
|
|
94 |
"metric_name":"pearson"
|
95 |
},
|
96 |
"FAQUAD NLI":
|
97 |
-
{"dataset_type":"
|
98 |
-
"dataset_name":"
|
99 |
"metric_type":"f1_macro",
|
100 |
"metric_value":results["FAQUAD NLI"],
|
101 |
"dataset_config": None,
|
@@ -105,8 +106,8 @@ def get_task_summary(results):
|
|
105 |
"metric_name":"f1-macro"
|
106 |
},
|
107 |
"HateBR":
|
108 |
-
{"dataset_type":"
|
109 |
-
"dataset_name":"HateBR",
|
110 |
"metric_type":"f1_macro",
|
111 |
"metric_value":results["HateBR"],
|
112 |
"dataset_config": None,
|
@@ -116,8 +117,8 @@ def get_task_summary(results):
|
|
116 |
"metric_name":"f1-macro"
|
117 |
},
|
118 |
"PT Hate Speech":
|
119 |
-
{"dataset_type":"
|
120 |
-
"dataset_name":"PT Hate Speech",
|
121 |
"metric_type":"f1_macro",
|
122 |
"metric_value":results["PT Hate Speech"],
|
123 |
"dataset_config": None,
|
@@ -127,7 +128,7 @@ def get_task_summary(results):
|
|
127 |
"metric_name":"f1-macro"
|
128 |
},
|
129 |
"tweetSentBR":
|
130 |
-
{"dataset_type":"tweetsentbr",
|
131 |
"dataset_name":"tweetSentBR",
|
132 |
"metric_type":"f1_macro",
|
133 |
"metric_value":results["tweetSentBR"],
|
@@ -146,7 +147,7 @@ def get_eval_results(repo):
|
|
146 |
task_summary = get_task_summary(results)
|
147 |
md_writer = MarkdownTableWriter()
|
148 |
md_writer.headers = ["Metric", "Value"]
|
149 |
-
md_writer.value_matrix = [["
|
150 |
|
151 |
text = f"""
|
152 |
# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
|
@@ -201,6 +202,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
|
|
201 |
if "Repo card metadata block was not found." in str(e): # There is no readme
|
202 |
readme_text = get_edited_yaml_readme(repo, token=token)
|
203 |
else:
|
|
|
204 |
print(f"Something went wrong: {e}")
|
205 |
|
206 |
liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
|
@@ -217,4 +219,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
|
|
217 |
elif "Repository Not Found" in str(e):
|
218 |
return "Repository Not Found"
|
219 |
else:
|
220 |
-
return e
|
|
|
|
|
|
|
|
7 |
import gradio as gr
|
8 |
from openllm import get_json_format_data, get_datas
|
9 |
import pandas as pd
|
10 |
+
import traceback
|
11 |
|
12 |
BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
|
13 |
|
|
|
24 |
"""
|
25 |
|
26 |
def search(df, value):
|
27 |
+
result_df = df[df["Model Name"] == value]
|
28 |
return result_df.iloc[0].to_dict() if not result_df.empty else None
|
29 |
|
30 |
|
|
|
40 |
def get_task_summary(results):
|
41 |
return {
|
42 |
"ENEM":
|
43 |
+
{"dataset_type":"eduagarcia/enem_challenge",
|
44 |
+
"dataset_name":"ENEM Challenge (No Images)",
|
45 |
"metric_type":"acc",
|
46 |
"metric_value":results["ENEM"],
|
47 |
"dataset_config": None,
|
|
|
51 |
"metric_name":"accuracy"
|
52 |
},
|
53 |
"BLUEX":
|
54 |
+
{"dataset_type":"eduagarcia-temp/BLUEX_without_images",
|
55 |
+
"dataset_name":"BLUEX (No Images)",
|
56 |
"metric_type":"acc",
|
57 |
"metric_value":results["BLUEX"],
|
58 |
"dataset_config": None,
|
|
|
62 |
"metric_name":"accuracy"
|
63 |
},
|
64 |
"OAB Exams":
|
65 |
+
{"dataset_type":"eduagarcia/oab_exams",
|
66 |
"dataset_name":"OAB Exams",
|
67 |
"metric_type":"acc",
|
68 |
"metric_value":results["OAB Exams"],
|
|
|
73 |
"metric_name":"accuracy"
|
74 |
},
|
75 |
"ASSIN2 RTE":
|
76 |
+
{"dataset_type":"assin2",
|
77 |
+
"dataset_name":"Assin2 RTE",
|
78 |
"metric_type":"f1_macro",
|
79 |
"metric_value":results["ASSIN2 RTE"],
|
80 |
"dataset_config": None,
|
|
|
84 |
"metric_name":"f1-macro"
|
85 |
},
|
86 |
"ASSIN2 STS":
|
87 |
+
{"dataset_type":"assin2",
|
88 |
+
"dataset_name":"Assin2 STS",
|
89 |
"metric_type":"pearson",
|
90 |
"metric_value":results["ASSIN2 STS"],
|
91 |
"dataset_config": None,
|
|
|
95 |
"metric_name":"pearson"
|
96 |
},
|
97 |
"FAQUAD NLI":
|
98 |
+
{"dataset_type":"ruanchaves/faquad-nli",
|
99 |
+
"dataset_name":"FaQuAD NLI",
|
100 |
"metric_type":"f1_macro",
|
101 |
"metric_value":results["FAQUAD NLI"],
|
102 |
"dataset_config": None,
|
|
|
106 |
"metric_name":"f1-macro"
|
107 |
},
|
108 |
"HateBR":
|
109 |
+
{"dataset_type":"eduagarcia/portuguese_benchmark",
|
110 |
+
"dataset_name":"HateBR Binary",
|
111 |
"metric_type":"f1_macro",
|
112 |
"metric_value":results["HateBR"],
|
113 |
"dataset_config": None,
|
|
|
117 |
"metric_name":"f1-macro"
|
118 |
},
|
119 |
"PT Hate Speech":
|
120 |
+
{"dataset_type":"eduagarcia/portuguese_benchmark",
|
121 |
+
"dataset_name":"PT Hate Speech Binary",
|
122 |
"metric_type":"f1_macro",
|
123 |
"metric_value":results["PT Hate Speech"],
|
124 |
"dataset_config": None,
|
|
|
128 |
"metric_name":"f1-macro"
|
129 |
},
|
130 |
"tweetSentBR":
|
131 |
+
{"dataset_type":"eduagarcia-temp/tweetsentbr",
|
132 |
"dataset_name":"tweetSentBR",
|
133 |
"metric_type":"f1_macro",
|
134 |
"metric_value":results["tweetSentBR"],
|
|
|
147 |
task_summary = get_task_summary(results)
|
148 |
md_writer = MarkdownTableWriter()
|
149 |
md_writer.headers = ["Metric", "Value"]
|
150 |
+
md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
|
151 |
|
152 |
text = f"""
|
153 |
# [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
|
|
|
202 |
if "Repo card metadata block was not found." in str(e): # There is no readme
|
203 |
readme_text = get_edited_yaml_readme(repo, token=token)
|
204 |
else:
|
205 |
+
traceback.print_exc()
|
206 |
print(f"Something went wrong: {e}")
|
207 |
|
208 |
liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
|
|
|
219 |
elif "Repository Not Found" in str(e):
|
220 |
return "Repository Not Found"
|
221 |
else:
|
222 |
+
return e
|
223 |
+
|
224 |
+
if __name__ == "__main__":
|
225 |
+
print(get_eval_results("Qwen/Qwen1.5-72B-Chat"))
|
openllm.py
CHANGED
@@ -41,4 +41,10 @@ def get_datas(data):
|
|
41 |
except (KeyError, TypeError):
|
42 |
continue
|
43 |
|
44 |
-
return result_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
except (KeyError, TypeError):
|
42 |
continue
|
43 |
|
44 |
+
return result_list
|
45 |
+
|
46 |
+
if __name__ == "__main__":
|
47 |
+
data = get_json_format_data()
|
48 |
+
print(data)
|
49 |
+
finished_models = get_datas(data)
|
50 |
+
print(finished_models)
|