Bram Vanroy committed on
Commit
dbe0b3a
·
2 Parent(s): 6a37317 0658988
app.py CHANGED
@@ -28,6 +28,8 @@ MODEL_TYPE_EMOJIS = {
28
  "RL-tuned": "🟦",
29
  }
30
 
 
 
31
 
32
  @dataclass
33
  class Result:
@@ -44,12 +46,14 @@ class Result:
44
  num_parameters_kmb: str = field(init=False)
45
 
46
  def __post_init__(self):
47
- if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]:
 
 
 
 
48
  raise ValueError(
49
- f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned'"
50
  )
51
- if self.dutch_coverage not in ["none", "pretrained", "fine-tuned"]:
52
- raise ValueError(f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned'")
53
 
54
  field_names = {f.name for f in fields(self)}
55
  for task_name in TASK_METRICS:
@@ -128,8 +132,10 @@ class ResultSet:
128
  f" dotted;'>{result.short_name}</a>"
129
  )
130
  if attr == "short_name"
131
- else MODEL_TYPE_EMOJIS[result.model_type]
132
  if attr == "model_type"
 
 
133
  else getattr(result, attr)
134
  for attr, col_name in self.column_names.items()
135
  }
@@ -203,8 +209,16 @@ def collect_results() -> ResultSet:
203
 
204
  if "results" not in data:
205
  continue
 
206
  task_results = data["results"]
207
  short_name = pfin.stem.split("_", 2)[2].lower()
 
 
 
 
 
 
 
208
  if short_name not in model_results:
209
  model_results[short_name] = {
210
  "short_name": short_name,
@@ -228,7 +242,10 @@ with gr.Blocks() as demo:
228
  gr.HTML(TITLE)
229
  gr.Markdown(INTRO_TEXT)
230
 
231
- gr.Markdown(f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
 
 
 
232
 
233
  results = collect_results()
234
 
 
28
  "RL-tuned": "🟦",
29
  }
30
 
31
+ NOT_GIVEN_SYMBOL = "❔"
32
+
33
 
34
  @dataclass
35
  class Result:
 
46
  num_parameters_kmb: str = field(init=False)
47
 
48
  def __post_init__(self):
49
+ if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
50
+ raise ValueError(
51
+ f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned', 'not-given"
52
+ )
53
+ if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
54
  raise ValueError(
55
+ f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given"
56
  )
 
 
57
 
58
  field_names = {f.name for f in fields(self)}
59
  for task_name in TASK_METRICS:
 
132
  f" dotted;'>{result.short_name}</a>"
133
  )
134
  if attr == "short_name"
135
+ else MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
136
  if attr == "model_type"
137
+ else (result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL)
138
+ if attr == "dutch_coverage"
139
  else getattr(result, attr)
140
  for attr, col_name in self.column_names.items()
141
  }
 
209
 
210
  if "results" not in data:
211
  continue
212
+
213
  task_results = data["results"]
214
  short_name = pfin.stem.split("_", 2)[2].lower()
215
+
216
+ if short_name not in model_info:
217
+ raise KeyError(
218
+ f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
219
+ f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
220
+ )
221
+
222
  if short_name not in model_results:
223
  model_results[short_name] = {
224
  "short_name": short_name,
 
242
  gr.HTML(TITLE)
243
  gr.Markdown(INTRO_TEXT)
244
 
245
+ gr.Markdown(
246
+ f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
247
+ " All models have been benchmarked in 8-bit."
248
+ )
249
 
250
  results = collect_results()
251
 
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "hellaswag_nl": {
4
- "acc": 0.44079870480302213,
5
- "acc_stderr": 0.005158280633507224,
6
- "acc_norm": 0.5840259039395574,
7
- "acc_norm_stderr": 0.005120942804814836
8
- }
9
- },
10
- "versions": {
11
- "hellaswag_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
16
- "batch_size": "auto",
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "hellaswag_nl": {
4
- "acc": 0.5143011332973556,
5
- "acc_stderr": 0.0051926973681393875,
6
- "acc_norm": 0.67835941716136,
7
- "acc_norm_stderr": 0.004853064643337017
8
- }
9
- },
10
- "versions": {
11
- "hellaswag_nl": 1
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=auto",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/models.json CHANGED
@@ -105,9 +105,9 @@
105
  },
106
  "mixtral-8x7b-v0.1": {
107
  "compute_dtype": "auto",
108
- "dutch_coverage": "not-given",
109
  "model_name": "mistralai/Mixtral-8x7B-v0.1",
110
- "model_type": "not-given",
111
  "num_parameters": 46702792704,
112
  "quantization": "8-bit"
113
  },
 
105
  },
106
  "mixtral-8x7b-v0.1": {
107
  "compute_dtype": "auto",
108
+ "dutch_coverage": "none",
109
  "model_name": "mistralai/Mixtral-8x7B-v0.1",
110
+ "model_type": "pretrained",
111
  "num_parameters": 46702792704,
112
  "quantization": "8-bit"
113
  },
generate_overview_json.py CHANGED
@@ -40,7 +40,8 @@ def main():
40
  "model_type": results[short_name]["model_type"]
41
  if short_name in results and "model_type" in results[short_name]
42
  else "not-given",
43
- "dutch_coverage": results[short_name]["dutch_coverage"] if short_name in results and "dutch_coverage" in results[short_name]
 
44
  else "not-given",
45
  }
46
 
 
40
  "model_type": results[short_name]["model_type"]
41
  if short_name in results and "model_type" in results[short_name]
42
  else "not-given",
43
+ "dutch_coverage": results[short_name]["dutch_coverage"]
44
+ if short_name in results and "dutch_coverage" in results[short_name]
45
  else "not-given",
46
  }
47