adding dutch
- src/display/about.py +12 -2
- src/leaderboard/read_evals.py +6 -3
src/display/about.py
CHANGED
@@ -42,6 +42,12 @@ class Tasks(Enum):
     task22 = Task("harness|belebele_spa_Latn|5", "acc_norm,none", "🇪🇸Belebele ES")
     task23 = Task("harness|hellaswag_es|10", "acc_norm,none", "🇪🇸HellaSwag ES")
     task24 = Task("harness|mmlu_m_es|5", "acc,none", "🇪🇸MMLU ES")
+    # add a new benchmark group for the language
+    task25 = Task("harness|mmlu_m_nl|5", "acc,none", "🇳🇱MMLU NL")
+    task26 = Task("harness|arc_challenge_m_nl|25", "acc_norm,none", "🇳🇱ARC NL")
+    task27 = Task("harness|truthfulqa_mc2_m_nl|0", "acc,none", "🇳🇱TruthfulQA NL")
+    task28 = Task("harness|belebele_nld_Latn|5", "acc_norm,none", "🇳🇱Belebele NL")
+    task29 = Task("harness|hellaswag_nl|10", "acc_norm,none", "🇳🇱HellaSwag NL")

 class Languages(Enum):
     lng0 = Language("🇩🇪 DE", "de", "deu")
@@ -49,6 +55,8 @@ class Languages(Enum):
     lng2 = Language("🇮🇹 IT", "it", "ita")
     lng3 = Language("🇪🇸 ES", "es", "spa")
     lng4 = Language("🇬🇧 EN", "", "eng")
+    # add a new language
+    lng5 = Language("🇳🇱 NL", "nl", "nld")

 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Occiglot Euro LLM Leaderboard</h1>"""
@@ -58,11 +66,13 @@ INTRODUCTION_TEXT = """
 <div border="2px">
 <p style="float: left;"><img src="https://huggingface.co/datasets/malteos/images/resolve/main/occiglot.medium.png" alt="Image" style="width:200px; margin-right:10px;" border="2px"/></p>

-<p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into the four main languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele</a> (French, Italian, German and Spanish).
+<p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into five languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele</a> (French, Italian, German, Spanish and Dutch).

 The translated tasks are uploaded to a fork of the great [Eleuther AI Language Model Evaluation Harness](https://github.com/occiglot/euro-lm-evaluation-harness).

-Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪) to represent the language, not the countries.</p>
+**UPDATE**: We added Dutch benchmarks to the leaderboard. From now on, all models will also be evaluated on Dutch (🇳🇱) benchmarks. However, for better comparison, we exclude those results from the Average for now.
+
+Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪🇳🇱) to represent the language, not the countries.</p>
 </div>
 """
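For orientation before the next file: the entries above construct `Task` and `Language` values defined earlier in `src/display/about.py`. In the Open LLM Leaderboard template this Space derives from, `Task` is a small dataclass holding the harness key, the metric name, and the display column; `Language` is specific to this repo, so the sketch below reconstructs it from the call sites alone (the field names are assumptions, not taken from the diff):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str  # harness key "harness|<task>|<n few-shot>", e.g. "harness|hellaswag_nl|10"
    metric: str     # metric name in the results file, e.g. "acc_norm,none"
    col_name: str   # column header shown in the leaderboard table

@dataclass
class Language:
    # Field names here are inferred from the call sites; the repo may
    # name them differently.
    label: str  # display label with flag emoji, e.g. "🇳🇱 NL"
    iso1: str   # ISO 639-1 code used as a task-name suffix, e.g. "nl"
    iso3: str   # ISO 639-3 code used by the Belebele task names, e.g. "nld"

# The new Dutch entries then follow the existing per-language pattern:
task25 = Task("harness|mmlu_m_nl|5", "acc,none", "🇳🇱MMLU NL")
lng5 = Language("🇳🇱 NL", "nl", "nld")

With these in place, adding Dutch is purely additive: the `nl`/`nld` codes thread through the harness task names (`mmlu_m_nl`, `belebele_nld_Latn`), which is exactly what the averaging code in read_evals.py keys on below.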
src/leaderboard/read_evals.py
CHANGED
@@ -71,9 +71,12 @@ class EvalResult:

             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # TODO: change for new files (maybe time stamp, maybe just if accs is None)
+            if "_nl" in task.benchmark and accs.size == 0:
+                accs = np.zeros(1)
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
+
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

@@ -106,8 +109,8 @@ class EvalResult:
            print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for k, v in self.results.items() if v is not None and "_nl" not in k]) / (len(Tasks) - 5)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,