barthfab commited on
Commit
90f6dd4
1 Parent(s): 4e7dd3e

adding Dutch

Browse files
src/display/about.py CHANGED
@@ -42,6 +42,12 @@ class Tasks(Enum):
42
  task22 = Task("harness|belebele_spa_Latn|5", "acc_norm,none", "🇪🇸Belebele ES")
43
  task23 = Task("harness|hellaswag_es|10", "acc_norm,none", "🇪🇸HellaSwag ES")
44
  task24 = Task("harness|mmlu_m_es|5", "acc,none", "🇪🇸MMLU ES")
 
 
 
 
 
 
45
 
46
  class Languages(Enum):
47
  lng0 = Language("🇩🇪 DE", "de", "deu")
@@ -49,6 +55,8 @@ class Languages(Enum):
49
  lng2 = Language("🇮🇹 IT", "it", "ita")
50
  lng3 = Language("🇪🇸 ES", "es", "spa")
51
  lng4 = Language("🇬🇧 EN", "", "eng")
 
 
52
 
53
  # Your leaderboard name
54
  TITLE = """<h1 align="center" id="space-title">Occiglot Euro LLM Leaderboard</h1>"""
@@ -58,11 +66,13 @@ INTRODUCTION_TEXT = """
58
  <div border="2px">
59
  <p style="float: left;"><img src="https://huggingface.co/datasets/malteos/images/resolve/main/occiglot.medium.png" alt="Image" style="width:200px; margin-right:10px;" border="2px"/></p>
60
 
61
- <p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into the four main languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele </a> (French, Italian, German and Spanish).
62
 
63
  The translated tasks are uploaded to a fork of the great [Eleuther AI Language Model Evaluation Harness](https://github.com/occiglot/euro-lm-evaluation-harness).
64
 
65
- Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪) to represent the language, not the countries.</p>
 
 
66
  </div>
67
  """
68
 
 
42
  task22 = Task("harness|belebele_spa_Latn|5", "acc_norm,none", "🇪🇸Belebele ES")
43
  task23 = Task("harness|hellaswag_es|10", "acc_norm,none", "🇪🇸HellaSwag ES")
44
  task24 = Task("harness|mmlu_m_es|5", "acc,none", "🇪🇸MMLU ES")
45
+ # add a new benchmark group for the language
46
+ task25 = Task("harness|mmlu_m_nl|5", "acc,none", "🇳🇱MMLU NL")
47
+ task26 = Task("harness|arc_challenge_m_nl|25", "acc_norm,none", "🇳🇱ARC NL")
48
+ task27 = Task("harness|truthfulqa_mc2_m_nl|0", "acc,none", "🇳🇱TruthfulQA NL")
49
+ task28 = Task("harness|belebele_nld_Latn|5", "acc_norm,none", "🇳🇱Belebele NL")
50
+ task29 = Task("harness|hellaswag_nl|10", "acc_norm,none", "🇳🇱HellaSwag NL")
51
 
52
  class Languages(Enum):
53
  lng0 = Language("🇩🇪 DE", "de", "deu")
 
55
  lng2 = Language("🇮🇹 IT", "it", "ita")
56
  lng3 = Language("🇪🇸 ES", "es", "spa")
57
  lng4 = Language("🇬🇧 EN", "", "eng")
58
+ # add a new language
59
+ lng5 = Language("🇳🇱 NL", "nl", "nld")
60
 
61
  # Your leaderboard name
62
  TITLE = """<h1 align="center" id="space-title">Occiglot Euro LLM Leaderboard</h1>"""
 
66
  <div border="2px">
67
  <p style="float: left;"><img src="https://huggingface.co/datasets/malteos/images/resolve/main/occiglot.medium.png" alt="Image" style="width:200px; margin-right:10px;" border="2px"/></p>
68
 
69
+ <p border="2px">The Occiglot euro LLM leaderboard evaluates a subset of the tasks from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">Open LLM Leaderboard</a> machine-translated into five languages from the <a href="https://github.com/nlp-uoregon/Okapi" target="_blank">Okapi benchmark</a> and <a href="https://arxiv.org/abs/2308.16884" target="_blank">Belebele</a> (French, Italian, German, Spanish and Dutch).
70
 
71
  The translated tasks are uploaded to a fork of the great [Eleuther AI Language Model Evaluation Harness](https://github.com/occiglot/euro-lm-evaluation-harness).
72
 
73
+ **UPDATE**: We added Dutch benchmarks to the leaderboard. From now on, all models will also be evaluated on Dutch (🇳🇱) tasks. However, for better comparability with earlier results, Dutch scores are currently excluded from the Average column.
74
+
75
+ Disclaimer: A language is not represented by a country. Different languages can be spoken and spread in all countries around the globe. For the sake of simplicity, we have used flag emojis (🇬🇧🇮🇹🇫🇷🇪🇸🇩🇪🇳🇱) to represent the language, not the countries.</p>
76
  </div>
77
  """
78
 
src/leaderboard/read_evals.py CHANGED
@@ -71,9 +71,12 @@ class EvalResult:
71
 
72
  # We average all scores of a given metric (not all metrics are present in all files)
73
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
 
74
  if accs.size == 0 or any([acc is None for acc in accs]):
75
  continue
76
-
77
  mean_acc = np.mean(accs) * 100.0
78
  results[task.benchmark] = mean_acc
79
 
@@ -106,8 +109,8 @@ class EvalResult:
106
  print(f"Could not find request file for {self.org}/{self.model}")
107
 
108
  def to_dict(self):
109
- """Converts the Eval Result to a dict compatible with our dataframe display"""
110
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
111
  data_dict = {
112
  "eval_name": self.eval_name, # not a column, just a save name,
113
  AutoEvalColumn.precision.name: self.precision.value.name,
 
71
 
72
  # We average all scores of a given metric (not all metrics are present in all files)
73
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
74
+ # TODO: change for new files (maybe time stamp, maybe just if accs is None)
75
+ if "_nl" in task.benchmark and accs.size == 0:
76
+ accs = np.zeros(1)
77
  if accs.size == 0 or any([acc is None for acc in accs]):
78
  continue
79
+
80
  mean_acc = np.mean(accs) * 100.0
81
  results[task.benchmark] = mean_acc
82
 
 
109
  print(f"Could not find request file for {self.org}/{self.model}")
110
 
111
  def to_dict(self):
112
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
113
+ average = sum([v for k, v in self.results.items() if v is not None and "_nl" not in k]) / (len(Tasks) - 5)
114
  data_dict = {
115
  "eval_name": self.eval_name, # not a column, just a save name,
116
  AutoEvalColumn.precision.name: self.precision.value.name,