adding dutch
- src/display/about.py +12 -2
- src/leaderboard/read_evals.py +6 -3
src/display/about.py
CHANGED
@@ -42,6 +42,12 @@ class Tasks(Enum):
     task22 = Task("harness|belebele_spa_Latn|5", "acc_norm,none", "🇪🇸Belebele ES")
     task23 = Task("harness|hellaswag_es|10", "acc_norm,none", "🇪🇸HellaSwag ES")
     task24 = Task("harness|mmlu_m_es|5", "acc,none", "🇪🇸MMLU ES")
+    # add a new benchmark group for the language
+    task25 = Task("harness|mmlu_m_nl|5", "acc,none", "🇳🇱MMLU NL")
+    task26 = Task("harness|arc_challenge_m_nl|25", "acc_norm,none", "🇳🇱ARC NL")
+    task27 = Task("harness|truthfulqa_mc2_m_nl|0", "acc,none", "🇳🇱TruthfulQA NL")
+    task28 = Task("harness|belebele_nld_Latn|5", "acc_norm,none", "🇳🇱Belebele NL")
+    task29 = Task("harness|hellaswag_nl|10", "acc_norm,none", "🇳🇱HellaSwag NL")

 class Languages(Enum):
     lng0 = Language("🇩🇪 DE", "de", "deu")
@@ -49,6 +55,8 @@ class Languages(Enum):
     lng2 = Language("🇮🇹 IT", "it", "ita")
     lng3 = Language("🇪🇸 ES", "es", "spa")
     lng4 = Language("🇬🇧 EN", "", "eng")
+    # add a new language
+    lng5 = Language("🇳🇱 NL", "nl", "nld")

 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Occiglot Euro LLM Leaderboard</h1>"""
@@ -58,11 +66,13 @@ INTRODUCTION_TEXT = """
 <div border="2px">
 <p style="float: left;"><img src="https://huggingface.co/datasets/malteos/images/resolve/main/occiglot.medium.png" alt="Image" style="width:200px; margin-right:10px;" border="2px"/></p>

-<p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into the four main languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele</a> (French, Italian, German and Spanish).
+<p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into five languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele</a> (French, Italian, German, Spanish and Dutch).

 The translated tasks are uploaded to a fork of the great [Eleuther AI Language Model Evaluation Harness](https://github.com/occiglot/euro-lm-evaluation-harness).

-Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪) to represent the language, not the countries.</p>
+**UPDATE**: We added Dutch benchmarks to the leaderboard. From now on, all models will also be evaluated on Dutch (🇳🇱) benchmarks. However, for better comparison, we exclude those results from the Average for now.
+
+Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪🇳🇱) to represent the language, not the countries.</p>
 </div>
 """
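For orientation before the next file: the entries above construct `Task` and `Language` values defined earlier in `src/display/about.py`. In the Open LLM Leaderboard template this Space derives from, `Task` is a small dataclass holding the harness key, the metric name, and the display column; `Language` is specific to this repo, so the sketch below reconstructs it from the call sites alone (the field names are assumptions, not taken from the diff):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str  # harness key "harness|<task>|<n few-shot>", e.g. "harness|hellaswag_nl|10"
    metric: str     # metric name in the results file, e.g. "acc_norm,none"
    col_name: str   # column header shown in the leaderboard table

@dataclass
class Language:
    # Field names here are inferred from the call sites; the repo may
    # name them differently.
    label: str  # display label with flag emoji, e.g. "🇳🇱 NL"
    iso1: str   # ISO 639-1 code used as a task-name suffix, e.g. "nl"
    iso3: str   # ISO 639-3 code used by the Belebele task names, e.g. "nld"

# The new Dutch entries then follow the existing per-language pattern:
task25 = Task("harness|mmlu_m_nl|5", "acc,none", "🇳🇱MMLU NL")
lng5 = Language("🇳🇱 NL", "nl", "nld")

With these in place, adding Dutch is purely additive: the `nl`/`nld` codes thread through the harness task names (`mmlu_m_nl`, `belebele_nld_Latn`), which is exactly what the averaging code in read_evals.py keys on below.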
src/leaderboard/read_evals.py
CHANGED
@@ -71,9 +71,12 @@ class EvalResult:

             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            # TODO: change for new files (maybe time stamp, maybe just if accs is None)
+            if "_nl" in task.benchmark and accs.size == 0:
+                accs = np.zeros(1)
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
+
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

@@ -106,8 +109,8 @@ class EvalResult:
            print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for k, v in self.results.items() if v is not None and "_nl" not in k]) / (len(Tasks) - 5)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,