Bram Vanroy committed on
Commit
cb090d6
2 Parent(s): 38b8e33 172dde4

Merge branch 'main' of https://huggingface.co/spaces/BramVanroy/open_dutch_llm_leaderboard

Browse files
app.py CHANGED
@@ -80,7 +80,7 @@ class ResultSet:
80
  "num_parameters": "Size",
81
  "average": "Avg.",
82
  "arc": "ARC (25-shot)",
83
- "hellaswag": "HellaSwag (10-shot)",
84
  "mmlu": "MMLU (5-shot)",
85
  "truthfulqa": "TruthfulQA (0-shot)",
86
  }
@@ -91,7 +91,7 @@ class ResultSet:
91
  "Size": "str",
92
  "Avg.": "number",
93
  "ARC (25-shot)": "number",
94
- "HellaSwag (10-shot)": "number",
95
  "MMLU (5-shot)": "number",
96
  "TruthfulQA (0-shot)": "number",
97
  }
 
80
  "num_parameters": "Size",
81
  "average": "Avg.",
82
  "arc": "ARC (25-shot)",
83
+ "hellaswag": "HellaSwag (10-shot)",
84
  "mmlu": "MMLU (5-shot)",
85
  "truthfulqa": "TruthfulQA (0-shot)",
86
  }
 
91
  "Size": "str",
92
  "Avg.": "number",
93
  "ARC (25-shot)": "number",
94
+ "HellaSwag (10-shot)": "number",
95
  "MMLU (5-shot)": "number",
96
  "TruthfulQA (0-shot)": "number",
97
  }
content.py CHANGED
@@ -21,7 +21,9 @@ DISCLAIMER = """## Disclaimer
21
  I did not verify the (translation) quality of the benchmarks. If you encounter issues with the benchmark contents, please contact the original authors.
22
 
23
  I am aware that benchmarking models on *translated* data is not ideal. However, for Dutch there are no other options for generative models at the moment. Because the benchmarks were automatically translated, some translationese effects may occur: the translations may not be fluent Dutch or still contain artifacts of the source text (like word order, literal translation, certain vocabulary items). Because of that, an unfair advantage may be given to the non-Dutch models: Dutch is closely related to English, so if the benchmarks are in automatically translated Dutch that still has English properties, those English models may not have too many issues with that. If the benchmarks were to have been manually translated or, even better, created from scratch in Dutch, those non-Dutch models may have a harder time. Maybe not. We cannot know for sure until we have high-quality, manually crafted benchmarks for Dutch.
24
-
 
 
25
  If you have any suggestions for other Dutch benchmarks, please [let me know](https://twitter.com/BramVanroy) so I can add them!
26
  """
27
 
 
21
  I did not verify the (translation) quality of the benchmarks. If you encounter issues with the benchmark contents, please contact the original authors.
22
 
23
  I am aware that benchmarking models on *translated* data is not ideal. However, for Dutch there are no other options for generative models at the moment. Because the benchmarks were automatically translated, some translationese effects may occur: the translations may not be fluent Dutch or still contain artifacts of the source text (like word order, literal translation, certain vocabulary items). Because of that, an unfair advantage may be given to the non-Dutch models: Dutch is closely related to English, so if the benchmarks are in automatically translated Dutch that still has English properties, those English models may not have too many issues with that. If the benchmarks were to have been manually translated or, even better, created from scratch in Dutch, those non-Dutch models may have a harder time. Maybe not. We cannot know for sure until we have high-quality, manually crafted benchmarks for Dutch.
24
+
25
+ Another shortcoming is that we do not calculate significance scores or confidence intervals. When results are close together in the leaderboard, I therefore urge caution when interpreting the model ranks.
26
+
27
  If you have any suggestions for other Dutch benchmarks, please [let me know](https://twitter.com/BramVanroy) so I can add them!
28
  """
29
 
evals/models.json CHANGED
@@ -59,7 +59,7 @@
59
  "compute_dtype": "bfloat16",
60
  "dutch_coverage": "none",
61
  "model_name": "meta-llama/Llama-2-13b-chat-hf",
62
- "model_type": "instruction-tuned",
63
  "num_parameters": 13015864320,
64
  "quantization": "8-bit"
65
  },
@@ -75,7 +75,7 @@
75
  "compute_dtype": "bfloat16",
76
  "dutch_coverage": "none",
77
  "model_name": "meta-llama/Llama-2-7b-chat-hf",
78
- "model_type": "instruction-tuned",
79
  "num_parameters": 6738415616,
80
  "quantization": "8-bit"
81
  },
@@ -123,7 +123,7 @@
123
  "compute_dtype": "bfloat16",
124
  "dutch_coverage": "none",
125
  "model_name": "microsoft/Orca-2-13b",
126
- "model_type": "fine-tuned",
127
  "num_parameters": 13015895040,
128
  "quantization": "8-bit"
129
  },
@@ -131,7 +131,7 @@
131
  "compute_dtype": "bfloat16",
132
  "dutch_coverage": "none",
133
  "model_name": "microsoft/Orca-2-7b",
134
- "model_type": "fine-tuned",
135
  "num_parameters": 6738440192,
136
  "quantization": "8-bit"
137
  },
 
59
  "compute_dtype": "bfloat16",
60
  "dutch_coverage": "none",
61
  "model_name": "meta-llama/Llama-2-13b-chat-hf",
62
+ "model_type": "RL-tuned",
63
  "num_parameters": 13015864320,
64
  "quantization": "8-bit"
65
  },
 
75
  "compute_dtype": "bfloat16",
76
  "dutch_coverage": "none",
77
  "model_name": "meta-llama/Llama-2-7b-chat-hf",
78
+ "model_type": "RL-tuned",
79
  "num_parameters": 6738415616,
80
  "quantization": "8-bit"
81
  },
 
123
  "compute_dtype": "bfloat16",
124
  "dutch_coverage": "none",
125
  "model_name": "microsoft/Orca-2-13b",
126
+ "model_type": "instruction-tuned",
127
  "num_parameters": 13015895040,
128
  "quantization": "8-bit"
129
  },
 
131
  "compute_dtype": "bfloat16",
132
  "dutch_coverage": "none",
133
  "model_name": "microsoft/Orca-2-7b",
134
+ "model_type": "instruction-tuned",
135
  "num_parameters": 6738440192,
136
  "quantization": "8-bit"
137
  },
evals/truthfulqa/truthfulqa_nl_Mixtral-8x7B-v0.1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "truthfulqa_nl": {
4
- "mc1": 0.310828025477707,
5
- "mc1_stderr": 0.016529733724696277,
6
- "mc2": 0.4629846929417042,
7
- "mc2_stderr": 0.01508660067127546
8
- }
9
- },
10
- "versions": {
11
- "truthfulqa_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=auto,load_in_8bit=True",
16
- "batch_size": 2,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }