Bram Vanroy
committed on
Commit
•
5693ee5
1
Parent(s):
0ae29b2
revision for Dutch only
Browse files- .gitignore +107 -126
- app.py +58 -91
- content.py +15 -23
- css.py +0 -13
- evals/arc/arc_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/arc/arc_nl_Llama-2-7b-hf.json +6 -6
- evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json} +6 -6
- evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json} +8 -8
- evals/arc/arc_nl_gpt2-medium-dutch.json +23 -0
- evals/arc/arc_nl_zephyr-7b-beta.json +6 -6
- evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json +6 -6
- evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +6 -6
- evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json → hellaswag_nl_Orca-2-7b.json} +6 -6
- evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json +23 -0
- evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json +23 -0
- evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json +23 -0
- evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json → mmlu/mmlu_nl_Mistral-7B-v0.1.json} +8 -8
- evals/mmlu/mmlu_nl_gpt2-large-dutch.json +23 -0
- evals/mmlu/mmlu_nl_gpt2-medium-dutch.json +23 -0
- evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json +0 -23
- evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json +6 -6
- evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json +4 -4
- evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json → truthfulqa_nl_Orca-2-7b.json} +6 -6
- evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json +0 -23
- evals/truthfulqa/truthfulqa_nl_falcon-40b.json +0 -23
- evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json → truthfulqa_nl_gpt2-large-dutch.json} +6 -6
- evals/truthfulqa/{truthfulqa_nl-falcon-40b.json → truthfulqa_nl_gpt2-medium-dutch.json} +6 -6
- evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json +0 -23
- evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json +0 -23
.gitignore
CHANGED
@@ -1,92 +1,42 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
wandb*
|
5 |
-
Pipfile*
|
6 |
-
data/*
|
7 |
-
muss
|
8 |
-
models/*
|
9 |
-
*config.json
|
10 |
-
|
11 |
-
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
12 |
-
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
13 |
-
|
14 |
-
.idea/
|
15 |
-
# User-specific stuff
|
16 |
-
.idea/**/workspace.xml
|
17 |
-
.idea/**/tasks.xml
|
18 |
-
.idea/**/usage.statistics.xml
|
19 |
-
.idea/**/dictionaries
|
20 |
-
.idea/**/shelf
|
21 |
-
|
22 |
-
# AWS User-specific
|
23 |
-
.idea/**/aws.xml
|
24 |
-
|
25 |
-
# Generated files
|
26 |
-
.idea/**/contentModel.xml
|
27 |
-
|
28 |
-
# Sensitive or high-churn files
|
29 |
-
.idea/**/dataSources/
|
30 |
-
.idea/**/dataSources.ids
|
31 |
-
.idea/**/dataSources.local.xml
|
32 |
-
.idea/**/sqlDataSources.xml
|
33 |
-
.idea/**/dynamic.xml
|
34 |
-
.idea/**/uiDesigner.xml
|
35 |
-
.idea/**/dbnavigator.xml
|
36 |
-
|
37 |
-
# Gradle
|
38 |
-
.idea/**/gradle.xml
|
39 |
-
.idea/**/libraries
|
40 |
-
|
41 |
-
# Gradle and Maven with auto-import
|
42 |
-
# When using Gradle or Maven with auto-import, you should exclude module files,
|
43 |
-
# since they will be recreated, and may cause churn. Uncomment if using
|
44 |
-
# auto-import.
|
45 |
-
# .idea/artifacts
|
46 |
-
# .idea/compiler.xml
|
47 |
-
# .idea/jarRepositories.xml
|
48 |
-
# .idea/modules.xml
|
49 |
-
# .idea/*.iml
|
50 |
-
# .idea/modules
|
51 |
-
# *.iml
|
52 |
-
# *.ipr
|
53 |
-
|
54 |
-
# CMake
|
55 |
-
cmake-build-*/
|
56 |
|
57 |
-
#
|
58 |
-
|
59 |
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
out/
|
65 |
|
66 |
-
#
|
67 |
-
|
68 |
|
69 |
-
#
|
70 |
-
|
71 |
|
72 |
-
#
|
73 |
-
|
74 |
|
75 |
-
#
|
76 |
-
.
|
|
|
|
|
|
|
|
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
crashlytics-build.properties
|
82 |
-
fabric.properties
|
83 |
|
84 |
-
# Editor-based Rest Client
|
85 |
-
.idea/httpRequests
|
86 |
|
87 |
-
#
|
88 |
-
|
|
|
89 |
|
|
|
|
|
90 |
|
91 |
# Byte-compiled / optimized / DLL files
|
92 |
__pycache__/
|
@@ -110,7 +60,6 @@ parts/
|
|
110 |
sdist/
|
111 |
var/
|
112 |
wheels/
|
113 |
-
share/python-wheels/
|
114 |
*.egg-info/
|
115 |
.installed.cfg
|
116 |
*.egg
|
@@ -129,17 +78,14 @@ pip-delete-this-directory.txt
|
|
129 |
# Unit test / coverage reports
|
130 |
htmlcov/
|
131 |
.tox/
|
132 |
-
.nox/
|
133 |
.coverage
|
134 |
.coverage.*
|
135 |
.cache
|
136 |
nosetests.xml
|
137 |
coverage.xml
|
138 |
*.cover
|
139 |
-
*.py,cover
|
140 |
.hypothesis/
|
141 |
.pytest_cache/
|
142 |
-
cover/
|
143 |
|
144 |
# Translations
|
145 |
*.mo
|
@@ -149,7 +95,6 @@ cover/
|
|
149 |
*.log
|
150 |
local_settings.py
|
151 |
db.sqlite3
|
152 |
-
db.sqlite3-journal
|
153 |
|
154 |
# Flask stuff:
|
155 |
instance/
|
@@ -162,41 +107,16 @@ instance/
|
|
162 |
docs/_build/
|
163 |
|
164 |
# PyBuilder
|
165 |
-
.pybuilder/
|
166 |
target/
|
167 |
|
168 |
# Jupyter Notebook
|
169 |
.ipynb_checkpoints
|
170 |
|
171 |
-
# IPython
|
172 |
-
profile_default/
|
173 |
-
ipython_config.py
|
174 |
-
|
175 |
# pyenv
|
176 |
-
|
177 |
-
|
178 |
-
#
|
179 |
-
|
180 |
-
# pipenv
|
181 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
182 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
183 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
184 |
-
# install all needed dependencies.
|
185 |
-
#Pipfile.lock
|
186 |
-
|
187 |
-
# poetry
|
188 |
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
189 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
190 |
-
# commonly ignored for libraries.
|
191 |
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
192 |
-
#poetry.lock
|
193 |
-
|
194 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
195 |
-
__pypackages__/
|
196 |
-
|
197 |
-
# Celery stuff
|
198 |
celerybeat-schedule
|
199 |
-
celerybeat.pid
|
200 |
|
201 |
# SageMath parsed files
|
202 |
*.sage.py
|
@@ -222,21 +142,82 @@ venv.bak/
|
|
222 |
|
223 |
# mypy
|
224 |
.mypy_cache/
|
225 |
-
.
|
226 |
-
dmypy.json
|
227 |
|
228 |
-
#
|
229 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
-
#
|
232 |
-
.
|
|
|
|
|
|
|
233 |
|
234 |
-
#
|
235 |
-
|
236 |
|
237 |
-
#
|
238 |
-
|
239 |
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
240 |
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
241 |
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
242 |
-
#.idea/
|
|
|
1 |
+
run-backend.ps
|
2 |
+
.eslintrc.js
|
3 |
+
.venv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
# ignore compiled styles
|
6 |
+
*.css
|
7 |
|
8 |
+
# dependencies
|
9 |
+
**/node_modules/
|
10 |
+
**/.pnp
|
11 |
+
*.pnp.js
|
|
|
12 |
|
13 |
+
# testing
|
14 |
+
/coverage
|
15 |
|
16 |
+
# VSCode
|
17 |
+
**/.vscode/
|
18 |
|
19 |
+
# production
|
20 |
+
**/build/
|
21 |
|
22 |
+
# misc
|
23 |
+
.DS_Store
|
24 |
+
.env.local
|
25 |
+
.env.development.local
|
26 |
+
.env.test.local
|
27 |
+
.env.production.local
|
28 |
|
29 |
+
npm-debug.log*
|
30 |
+
yarn-debug.log*
|
31 |
+
yarn-error.log*
|
|
|
|
|
32 |
|
|
|
|
|
33 |
|
34 |
+
# python
|
35 |
+
data/
|
36 |
+
Pipfile*
|
37 |
|
38 |
+
# .idea (JetBrains)
|
39 |
+
**/.idea/
|
40 |
|
41 |
# Byte-compiled / optimized / DLL files
|
42 |
__pycache__/
|
|
|
60 |
sdist/
|
61 |
var/
|
62 |
wheels/
|
|
|
63 |
*.egg-info/
|
64 |
.installed.cfg
|
65 |
*.egg
|
|
|
78 |
# Unit test / coverage reports
|
79 |
htmlcov/
|
80 |
.tox/
|
|
|
81 |
.coverage
|
82 |
.coverage.*
|
83 |
.cache
|
84 |
nosetests.xml
|
85 |
coverage.xml
|
86 |
*.cover
|
|
|
87 |
.hypothesis/
|
88 |
.pytest_cache/
|
|
|
89 |
|
90 |
# Translations
|
91 |
*.mo
|
|
|
95 |
*.log
|
96 |
local_settings.py
|
97 |
db.sqlite3
|
|
|
98 |
|
99 |
# Flask stuff:
|
100 |
instance/
|
|
|
107 |
docs/_build/
|
108 |
|
109 |
# PyBuilder
|
|
|
110 |
target/
|
111 |
|
112 |
# Jupyter Notebook
|
113 |
.ipynb_checkpoints
|
114 |
|
|
|
|
|
|
|
|
|
115 |
# pyenv
|
116 |
+
.python-version
|
117 |
+
|
118 |
+
# celery beat schedule file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
celerybeat-schedule
|
|
|
120 |
|
121 |
# SageMath parsed files
|
122 |
*.sage.py
|
|
|
142 |
|
143 |
# mypy
|
144 |
.mypy_cache/
|
145 |
+
test.py
|
|
|
146 |
|
147 |
+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
148 |
+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
149 |
+
|
150 |
+
# User-specific stuff
|
151 |
+
.idea/**/workspace.xml
|
152 |
+
.idea/**/tasks.xml
|
153 |
+
.idea/**/usage.statistics.xml
|
154 |
+
.idea/**/dictionaries
|
155 |
+
.idea/**/shelf
|
156 |
+
|
157 |
+
# AWS User-specific
|
158 |
+
.idea/**/aws.xml
|
159 |
+
|
160 |
+
# Generated files
|
161 |
+
.idea/**/contentModel.xml
|
162 |
+
|
163 |
+
# Sensitive or high-churn files
|
164 |
+
.idea/**/dataSources/
|
165 |
+
.idea/**/dataSources.ids
|
166 |
+
.idea/**/dataSources.local.xml
|
167 |
+
.idea/**/sqlDataSources.xml
|
168 |
+
.idea/**/dynamic.xml
|
169 |
+
.idea/**/uiDesigner.xml
|
170 |
+
.idea/**/dbnavigator.xml
|
171 |
+
|
172 |
+
# Gradle
|
173 |
+
.idea/**/gradle.xml
|
174 |
+
.idea/**/libraries
|
175 |
+
|
176 |
+
# Gradle and Maven with auto-import
|
177 |
+
# When using Gradle or Maven with auto-import, you should exclude module files,
|
178 |
+
# since they will be recreated, and may cause churn. Uncomment if using
|
179 |
+
# auto-import.
|
180 |
+
# .idea/artifacts
|
181 |
+
# .idea/compiler.xml
|
182 |
+
# .idea/jarRepositories.xml
|
183 |
+
# .idea/modules.xml
|
184 |
+
# .idea/*.iml
|
185 |
+
# .idea/modules
|
186 |
+
# *.iml
|
187 |
+
# *.ipr
|
188 |
+
|
189 |
+
# CMake
|
190 |
+
cmake-build-*/
|
191 |
+
|
192 |
+
# Mongo Explorer plugin
|
193 |
+
.idea/**/mongoSettings.xml
|
194 |
+
|
195 |
+
# File-based project format
|
196 |
+
*.iws
|
197 |
+
|
198 |
+
# IntelliJ
|
199 |
+
out/
|
200 |
+
|
201 |
+
# mpeltonen/sbt-idea plugin
|
202 |
+
.idea_modules/
|
203 |
+
|
204 |
+
# JIRA plugin
|
205 |
+
atlassian-ide-plugin.xml
|
206 |
+
|
207 |
+
# Cursive Clojure plugin
|
208 |
+
.idea/replstate.xml
|
209 |
+
|
210 |
+
# SonarLint plugin
|
211 |
+
.idea/sonarlint/
|
212 |
|
213 |
+
# Crashlytics plugin (for Android Studio and IntelliJ)
|
214 |
+
com_crashlytics_export_strings.xml
|
215 |
+
crashlytics.properties
|
216 |
+
crashlytics-build.properties
|
217 |
+
fabric.properties
|
218 |
|
219 |
+
# Editor-based Rest Client
|
220 |
+
.idea/httpRequests
|
221 |
|
222 |
+
# Android studio 3.1+ serialized cache file
|
223 |
+
.idea/caches/build_file_checksums.ser
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -2,12 +2,13 @@ import json
|
|
2 |
from collections import defaultdict
|
3 |
from pathlib import Path
|
4 |
|
|
|
5 |
import pandas as pd
|
6 |
import gradio as gr
|
|
|
|
|
7 |
|
8 |
from content import *
|
9 |
-
from css import *
|
10 |
-
import glob
|
11 |
|
12 |
ARC = "arc"
|
13 |
HELLASWAG = "hellaswag"
|
@@ -17,51 +18,17 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
|
|
17 |
|
18 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
"
|
27 |
-
"de": "German",
|
28 |
-
"es": "Spanish",
|
29 |
-
"eu": "Basque",
|
30 |
-
"fr": "French",
|
31 |
-
"gu": "Gujarati",
|
32 |
-
"hi": "Hindi",
|
33 |
-
"hr": "Croatian",
|
34 |
-
"hu": "Hungarian",
|
35 |
-
"hy": "Armenian",
|
36 |
-
"id": "Indonesian",
|
37 |
-
"it": "Italian",
|
38 |
-
"kn": "Kannada",
|
39 |
-
"ml": "Malayalam",
|
40 |
-
"mr": "Marathi",
|
41 |
-
"ne": "Nepali",
|
42 |
-
"nl": "Dutch",
|
43 |
-
"pt": "Portuguese",
|
44 |
-
"ro": "Romanian",
|
45 |
-
"ru": "Russian",
|
46 |
-
"sk": "Slovak",
|
47 |
-
"sr": "Serbian",
|
48 |
-
"sv": "Swedish",
|
49 |
-
"ta": "Tamil",
|
50 |
-
"te": "Telugu",
|
51 |
-
"uk": "Ukrainian",
|
52 |
-
"vi": "Vietnamese",
|
53 |
-
"zh": "Chinese",
|
54 |
-
}
|
55 |
-
|
56 |
-
|
57 |
-
def collect_results():
|
58 |
performance_dict = defaultdict(dict)
|
59 |
-
pretrained_models = set()
|
60 |
for pfin in Path("evals").rglob("*.json"):
|
61 |
data = json.loads(pfin.read_text(encoding="utf-8"))
|
62 |
-
if "results" not in data:
|
63 |
-
continue
|
64 |
-
if "config" not in data:
|
65 |
continue
|
66 |
results = data["results"]
|
67 |
config = data["config"]
|
@@ -74,7 +41,6 @@ def collect_results():
|
|
74 |
continue
|
75 |
pretrained = pretrained[0].split("=")[1]
|
76 |
pretrained = pretrained.split("/")[-1]
|
77 |
-
pretrained_models.add(pretrained)
|
78 |
|
79 |
for lang_task, perfs in results.items():
|
80 |
task, lang = lang_task.split("_")
|
@@ -85,33 +51,46 @@ def collect_results():
|
|
85 |
p = round(perfs[metric] * 100, 1)
|
86 |
performance_dict[(pretrained, lang)][task] = p
|
87 |
|
88 |
-
return performance_dict
|
89 |
|
90 |
|
91 |
-
def
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
for (pretrained, lang), perfs in performance_dict.items():
|
94 |
-
lang_name = LANG_NAME[lang]
|
95 |
arc_perf = perfs.get(ARC, 0.0)
|
96 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
97 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
98 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
99 |
|
100 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
101 |
-
|
102 |
-
row
|
103 |
-
df.append(row)
|
104 |
|
105 |
-
df = pd.DataFrame.from_records(
|
106 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
107 |
-
df = df[COLS]
|
108 |
-
|
109 |
return df
|
110 |
|
111 |
|
112 |
-
def
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
|
117 |
MODEL_COL = "Model"
|
@@ -120,43 +99,31 @@ ARC_COL = "ARC (25-shot)"
|
|
120 |
HELLASWAG_COL = "HellaSwag (10-shot)οΈ"
|
121 |
MMLU_COL = "MMLU (5-shot)"
|
122 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
123 |
-
NOTES_COL = "Notes" # For search only
|
124 |
-
|
125 |
-
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
|
126 |
-
TYPES = ["str", "number", "number", "number", "number", "number", "str"]
|
127 |
|
128 |
-
|
129 |
-
|
130 |
|
131 |
-
|
132 |
-
|
|
|
|
|
133 |
gr.HTML(TITLE)
|
134 |
-
gr.Markdown(INTRO_TEXT
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
)
|
147 |
-
|
148 |
-
# # Dummy leaderboard for handling the case when the user uses backspace key
|
149 |
-
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
150 |
-
value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
|
151 |
-
)
|
152 |
-
|
153 |
-
search_bar.change(
|
154 |
-
search_table,
|
155 |
-
[hidden_leaderboard_table_for_search, search_bar],
|
156 |
-
leaderboard_table,
|
157 |
-
)
|
158 |
|
159 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
160 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
161 |
|
162 |
-
|
|
|
|
|
|
2 |
from collections import defaultdict
|
3 |
from pathlib import Path
|
4 |
|
5 |
+
import numpy as np
|
6 |
import pandas as pd
|
7 |
import gradio as gr
|
8 |
+
from pandas import DataFrame
|
9 |
+
from pandas.io.formats.style import Styler
|
10 |
|
11 |
from content import *
|
|
|
|
|
12 |
|
13 |
ARC = "arc"
|
14 |
HELLASWAG = "hellaswag"
|
|
|
18 |
|
19 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
20 |
|
21 |
+
|
22 |
+
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
|
23 |
+
"""
|
24 |
+
Collects results from the evals folder and returns a dictionary of results
|
25 |
+
:return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
|
26 |
+
dictionaries of the form {benchmark_name: performance_score}
|
27 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
performance_dict = defaultdict(dict)
|
|
|
29 |
for pfin in Path("evals").rglob("*.json"):
|
30 |
data = json.loads(pfin.read_text(encoding="utf-8"))
|
31 |
+
if "results" not in data or "config" not in data:
|
|
|
|
|
32 |
continue
|
33 |
results = data["results"]
|
34 |
config = data["config"]
|
|
|
41 |
continue
|
42 |
pretrained = pretrained[0].split("=")[1]
|
43 |
pretrained = pretrained.split("/")[-1]
|
|
|
44 |
|
45 |
for lang_task, perfs in results.items():
|
46 |
task, lang = lang_task.split("_")
|
|
|
51 |
p = round(perfs[metric] * 100, 1)
|
52 |
performance_dict[(pretrained, lang)][task] = p
|
53 |
|
54 |
+
return dict(performance_dict)
|
55 |
|
56 |
|
57 |
+
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
|
58 |
+
"""
|
59 |
+
Builds a dataframe from the performance dictionary
|
60 |
+
:param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the values are
|
61 |
+
dictionaries of the form {benchmark_name: performance_score}
|
62 |
+
:return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
|
63 |
+
"""
|
64 |
+
data = []
|
65 |
for (pretrained, lang), perfs in performance_dict.items():
|
|
|
66 |
arc_perf = perfs.get(ARC, 0.0)
|
67 |
hellaswag_perf = perfs.get(HELLASWAG, 0.0)
|
68 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
69 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
70 |
|
71 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
72 |
+
row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
|
73 |
+
data.append(row)
|
|
|
74 |
|
75 |
+
df = pd.DataFrame.from_records(data, columns=COLS)
|
76 |
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
|
|
|
|
77 |
return df
|
78 |
|
79 |
|
80 |
+
def style_df(df: DataFrame) -> Styler:
|
81 |
+
"""
|
82 |
+
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
|
83 |
+
:param df: the dataframe to style
|
84 |
+
:return: the Styler
|
85 |
+
"""
|
86 |
+
styler = df.style.format("{:.2f}", subset=df.columns[1:])
|
87 |
+
|
88 |
+
def highlight_max(col):
|
89 |
+
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
|
90 |
+
|
91 |
+
styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
|
92 |
+
|
93 |
+
return styler
|
94 |
|
95 |
|
96 |
MODEL_COL = "Model"
|
|
|
99 |
HELLASWAG_COL = "HellaSwag (10-shot)οΈ"
|
100 |
MMLU_COL = "MMLU (5-shot)"
|
101 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
|
|
|
|
|
|
|
|
102 |
|
103 |
+
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
|
104 |
+
TYPES = ["str", "number", "number", "number", "number", "number"]
|
105 |
|
106 |
+
results = collect_results()
|
107 |
+
original_df = build_performance_df(results)
|
108 |
+
styled_df = style_df(original_df)
|
109 |
+
with gr.Blocks() as demo:
|
110 |
gr.HTML(TITLE)
|
111 |
+
gr.Markdown(INTRO_TEXT)
|
112 |
+
|
113 |
+
gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
|
114 |
+
gr.components.Dataframe(
|
115 |
+
value=original_df,
|
116 |
+
headers=COLS,
|
117 |
+
datatype=TYPES,
|
118 |
+
elem_id="leaderboard-table",
|
119 |
+
)
|
120 |
+
|
121 |
+
gr.Markdown("## LaTeX")
|
122 |
+
gr.Code(styled_df.to_latex(convert_css=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
gr.Markdown(CREDIT, elem_classes="markdown-text")
|
125 |
gr.Markdown(CITATION, elem_classes="markdown-text")
|
126 |
|
127 |
+
if __name__ == '__main__':
|
128 |
+
demo.launch()
|
129 |
+
|
content.py
CHANGED
@@ -1,44 +1,29 @@
|
|
1 |
-
TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard</h1>'
|
2 |
|
3 |
INTRO_TEXT = f"""
|
4 |
## About
|
5 |
|
6 |
-
This
|
7 |
-
|
8 |
-
Our current leaderboard provides evaluation data for 29 languages, i.e.,
|
9 |
-
Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
|
10 |
-
French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
|
11 |
-
Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
|
12 |
-
Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
|
13 |
-
Both multilingual and language-specific LLMs are welcome in this leaderboard.
|
14 |
-
We currently evaluate models over four benchmarks:
|
15 |
|
16 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
17 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
18 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
19 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
20 |
|
21 |
-
|
22 |
|
23 |
"""
|
24 |
|
25 |
-
HOW_TO = f"""
|
26 |
-
## How to list your model performance on this leaderboard:
|
27 |
-
|
28 |
-
Run the evaluation of your model using this repo: <a href="https://github.com/laiviet/lm-evaluation-harness" target="_blank">https://github.com/laiviet/lm-evaluation-harness</a>.
|
29 |
-
|
30 |
-
And then, push the evaluation log and make a pull request.
|
31 |
-
"""
|
32 |
-
|
33 |
CREDIT = f"""
|
34 |
## Credit
|
35 |
|
36 |
-
|
37 |
|
38 |
- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
|
39 |
-
- Funding and GPU access (Adobe Research)
|
40 |
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
|
41 |
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)
|
|
|
42 |
|
43 |
"""
|
44 |
|
@@ -46,12 +31,19 @@ To make this website, we use the following resources:
|
|
46 |
CITATION = f"""
|
47 |
## Citation
|
48 |
|
49 |
-
```
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
@misc{{lai2023openllmbenchmark,
|
52 |
author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
|
53 |
title={{Open Multilingual LLM Evaluation Leaderboard}},
|
54 |
year={{2023}}
|
55 |
}}
|
56 |
```
|
57 |
-
"""
|
|
|
1 |
+
TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Leaderboard (Dutch only)</h1>'
|
2 |
|
3 |
INTRO_TEXT = f"""
|
4 |
## About
|
5 |
|
6 |
+
This is a fork of the [Open Multilingual LLM Evaluation Leaderboard](https://huggingface.co/spaces/uonlp/open_multilingual_llm_leaderboard), but restricted to only Dutch models and augmented with additional model results.
|
7 |
+
We test the models on the following benchmarks **for the Dutch version only!!**, which have been translated into Dutch automatically by the original authors of the Open Multilingual LLM Evaluation Leaderboard with `gpt-35-turbo`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
|
10 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
|
11 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot)
|
12 |
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot)
|
13 |
|
14 |
+
I do not maintain those datasets; I only run benchmarks and add the results to this space. For questions regarding the test sets or running them yourself, see [the original GitHub repository](https://github.com/laiviet/lm-evaluation-harness).
|
15 |
|
16 |
"""
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
CREDIT = f"""
|
19 |
## Credit
|
20 |
|
21 |
+
This leaderboard has borrowed heavily from the following sources:
|
22 |
|
23 |
- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
|
|
|
24 |
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
|
25 |
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)
|
26 |
+
- The multilingual version of the leaderboard (uonlp's open_multilingual_llm_leaderboard repo)
|
27 |
|
28 |
"""
|
29 |
|
|
|
31 |
CITATION = f"""
|
32 |
## Citation
|
33 |
|
|
|
34 |
|
35 |
+
If you use or cite the Dutch benchmark results or this specific leaderboard page, please cite the following paper:
|
36 |
+
|
37 |
+
TBD
|
38 |
+
|
39 |
+
|
40 |
+
If you use the multilingual benchmarks, please cite the following paper:
|
41 |
+
|
42 |
+
```bibtex
|
43 |
@misc{{lai2023openllmbenchmark,
|
44 |
author = {{Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen}},
|
45 |
title={{Open Multilingual LLM Evaluation Leaderboard}},
|
46 |
year={{2023}}
|
47 |
}}
|
48 |
```
|
49 |
+
"""
|
css.py
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
CUSTOM_CSS = """
|
2 |
-
/* Hides the final column */
|
3 |
-
table td:last-child,
|
4 |
-
table th:last-child {
|
5 |
-
display: none;
|
6 |
-
}
|
7 |
-
# table td:first-child,
|
8 |
-
# table th:first-child {
|
9 |
-
# max-width: 400px;
|
10 |
-
# overflow: auto;
|
11 |
-
# white-space: nowrap;
|
12 |
-
# }
|
13 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_nl_Llama-2-7b-chat-hf.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
+
"acc": 0.3550042771599658,
|
5 |
+
"acc_stderr": 0.014001474982174305,
|
6 |
+
"acc_norm": 0.3609923011120616,
|
7 |
+
"acc_norm_stderr": 0.014053373664144789
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/arc_nl_Llama-2-7b-hf.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
+
"acc": 0.33447390932420873,
|
5 |
+
"acc_stderr": 0.013805185437125271,
|
6 |
+
"acc_norm": 0.3558597091531223,
|
7 |
+
"acc_norm_stderr": 0.014009035017396714
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/{arc_nl_Mistral-7B-v0.1.json → arc_nl_Orca-2-7b.json}
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
+
"acc": 0.3661248930710009,
|
5 |
+
"acc_stderr": 0.014095972894279241,
|
6 |
+
"acc_norm": 0.3678357570573139,
|
7 |
+
"acc_norm_stderr": 0.014109788842173
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/{truthfulqa/truthfulqa_nl-Llama-2-13b-hf.json → arc/arc_nl_gpt2-large-dutch.json}
RENAMED
@@ -1,19 +1,19 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
-
"
|
4 |
-
"
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
-
"
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.20102651839178784,
|
5 |
+
"acc_stderr": 0.011726581781869408,
|
6 |
+
"acc_norm": 0.24037639007698888,
|
7 |
+
"acc_norm_stderr": 0.01250327289928353
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/arc_nl_gpt2-medium-dutch.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.21471343028229256,
|
5 |
+
"acc_stderr": 0.012014958326088981,
|
6 |
+
"acc_norm": 0.24294268605645852,
|
7 |
+
"acc_norm_stderr": 0.012548588352773891
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/arc/arc_nl_zephyr-7b-beta.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"arc_nl": {
|
4 |
+
"acc": 0.4311377245508982,
|
5 |
+
"acc_stderr": 0.014490726457652989,
|
6 |
+
"acc_norm": 0.43199315654405473,
|
7 |
+
"acc_norm_stderr": 0.014494184864971338
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Llama-2-7b-chat-hf.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
+
"acc": 0.3838100377765785,
|
5 |
+
"acc_stderr": 0.005052614927289456,
|
6 |
+
"acc_norm": 0.4819212088505127,
|
7 |
+
"acc_norm_stderr": 0.005191425828002782
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Llama-2-7b-hf.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
+
"acc": 0.386184565569347,
|
5 |
+
"acc_stderr": 0.00505844561828187,
|
6 |
+
"acc_norm": 0.4957366432811657,
|
7 |
+
"acc_norm_stderr": 0.0051946338704556266
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
+
"acc": 0.4336751214247167,
|
5 |
+
"acc_stderr": 0.0051489159372014965,
|
6 |
+
"acc_norm": 0.5662169454937939,
|
7 |
+
"acc_norm_stderr": 0.005149065890785751
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/hellaswag/{hellaswag_nl_zephyr-7b-beta.json β hellaswag_nl_Orca-2-7b.json}
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"hellaswag_nl": {
|
4 |
+
"acc": 0.38456556934700487,
|
5 |
+
"acc_stderr": 0.005054483938257531,
|
6 |
+
"acc_norm": 0.48041014570966,
|
7 |
+
"acc_norm_stderr": 0.005190834031799853
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/hellaswag/hellaswag_nl_gpt2-large-dutch.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"hellaswag_nl": {
|
4 |
+
"acc": 0.3043712898003238,
|
5 |
+
"acc_stderr": 0.004780698091128437,
|
6 |
+
"acc_norm": 0.34279546681057743,
|
7 |
+
"acc_norm_stderr": 0.004931380767300367
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"hellaswag_nl": 1
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/hellaswag/hellaswag_nl_gpt2-medium-dutch.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"hellaswag_nl": {
|
4 |
+
"acc": 0.31246627091203455,
|
5 |
+
"acc_stderr": 0.004815587775923881,
|
6 |
+
"acc_norm": 0.36438208310847275,
|
7 |
+
"acc_norm_stderr": 0.00500008398696681
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"hellaswag_nl": 1
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/hellaswag/hellaswag_nl_neural-chat-7b-v3-1.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"hellaswag_nl": {
|
4 |
+
"acc": 0.44069077172153265,
|
5 |
+
"acc_stderr": 0.0051581467942195215,
|
6 |
+
"acc_norm": 0.5429033998920669,
|
7 |
+
"acc_norm_stderr": 0.005175663147811796
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"hellaswag_nl": 1
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=Intel/neural-chat-7b-v3-1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/{truthfulqa/truthfulqa_nl_Mistral-7B-v0.1.json β mmlu/mmlu_nl_Mistral-7B-v0.1.json}
RENAMED
@@ -1,19 +1,19 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
-
"
|
4 |
-
"
|
5 |
-
"
|
6 |
-
"
|
7 |
-
"
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
-
"
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
+
"mmlu_nl": {
|
4 |
+
"acc": 0.45974045685664416,
|
5 |
+
"acc_stderr": 0.004341759787221058,
|
6 |
+
"acc_norm": 0.36912802610609396,
|
7 |
+
"acc_norm_stderr": 0.0042040447899996366
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
+
"mmlu_nl": 0
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/mmlu/mmlu_nl_gpt2-large-dutch.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"mmlu_nl": {
|
4 |
+
"acc": 0.2301737876603172,
|
5 |
+
"acc_stderr": 0.003667182186959482,
|
6 |
+
"acc_norm": 0.2436821734841011,
|
7 |
+
"acc_norm_stderr": 0.0037400056232706905
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"mmlu_nl": 0
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/mmlu/mmlu_nl_gpt2-medium-dutch.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"mmlu_nl": {
|
4 |
+
"acc": 0.23343704940426502,
|
5 |
+
"acc_stderr": 0.0036852504856799066,
|
6 |
+
"acc_norm": 0.2483873415800258,
|
7 |
+
"acc_norm_stderr": 0.003764176503735655
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"mmlu_nl": 0
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/truthfulqa/truthfulqa_nl_Llama-2-13b-hf.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.2764331210191083,
|
5 |
-
"mc1_stderr": 0.01597262688062874,
|
6 |
-
"mc2": 0.4103755310313891,
|
7 |
-
"mc2_stderr": 0.014811313488625848
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"truthfulqa_nl": 1
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-13b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-chat-hf.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.
|
5 |
-
"mc1_stderr": 0.
|
6 |
-
"mc2": 0.
|
7 |
-
"mc2_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
+
"mc1": 0.289171974522293,
|
5 |
+
"mc1_stderr": 0.016192068781346693,
|
6 |
+
"mc2": 0.4445882138885173,
|
7 |
+
"mc2_stderr": 0.016144169053565395
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_Llama-2-7b-hf.json
CHANGED
@@ -3,8 +3,8 @@
|
|
3 |
"truthfulqa_nl": {
|
4 |
"mc1": 0.28152866242038216,
|
5 |
"mc1_stderr": 0.016062309899461683,
|
6 |
-
"mc2": 0.
|
7 |
-
"mc2_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
3 |
"truthfulqa_nl": {
|
4 |
"mc1": 0.28152866242038216,
|
5 |
"mc1_stderr": 0.016062309899461683,
|
6 |
+
"mc2": 0.41449853431238814,
|
7 |
+
"mc2_stderr": 0.014922005996963188
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/truthfulqa/{truthfulqa_nl-falcon-40b-ft-alpaca-dolly-dutch.json β truthfulqa_nl_Orca-2-7b.json}
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.
|
5 |
-
"mc1_stderr": 0.
|
6 |
-
"mc2": 0.
|
7 |
-
"mc2_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
+
"mc1": 0.3146496815286624,
|
5 |
+
"mc1_stderr": 0.01658486445168711,
|
6 |
+
"mc2": 0.4488463711895695,
|
7 |
+
"mc2_stderr": 0.016292493035951996
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=microsoft/Orca-2-7b,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_falcon-40b-ft-alpaca-dolly-dutch.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.310828025477707,
|
5 |
-
"mc1_stderr": 0.016529733724696277,
|
6 |
-
"mc2": 0.4460845208916539,
|
7 |
-
"mc2_stderr": 0.01476856418537487
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"truthfulqa_nl": 1
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=BramVanroy/falcon-40b-ft-alpaca-dolly-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_falcon-40b.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.2764331210191083,
|
5 |
-
"mc1_stderr": 0.01597262688062875,
|
6 |
-
"mc2": 0.4091336161450544,
|
7 |
-
"mc2_stderr": 0.014605140809282338
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"truthfulqa_nl": 1
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=tiiuae/falcon-40b,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/{truthfulqa_nl-llama2-13b-ft-mc4_nl_cleaned_tiny.json β truthfulqa_nl_gpt2-large-dutch.json}
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.
|
5 |
-
"mc1_stderr": 0.
|
6 |
-
"mc2": 0.
|
7 |
-
"mc2_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
+
"mc1": 0.25987261146496815,
|
5 |
+
"mc1_stderr": 0.015663018533664023,
|
6 |
+
"mc2": 0.41961324970531233,
|
7 |
+
"mc2_stderr": 0.01509691194885121
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-large-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/truthfulqa/{truthfulqa_nl-falcon-40b.json β truthfulqa_nl_gpt2-medium-dutch.json}
RENAMED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.
|
5 |
-
"mc1_stderr": 0.
|
6 |
-
"mc2": 0.
|
7 |
-
"mc2_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
@@ -12,8 +12,8 @@
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
"truthfulqa_nl": {
|
4 |
+
"mc1": 0.2878980891719745,
|
5 |
+
"mc1_stderr": 0.0161708346142461,
|
6 |
+
"mc2": 0.4527386932512769,
|
7 |
+
"mc2_stderr": 0.015417954968769677
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
|
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=yhavinga/gpt2-medium-dutch,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
|
16 |
+
"batch_size": "auto",
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/truthfulqa/truthfulqa_nl_llama2-13b-ft-mc4_nl_cleaned_tiny.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.2751592356687898,
|
5 |
-
"mc1_stderr": 0.0159498029022655,
|
6 |
-
"mc2": 0.41816127879466414,
|
7 |
-
"mc2_stderr": 0.01474120131034505
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"truthfulqa_nl": 1
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size": 8,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/truthfulqa/truthfulqa_nl_zephyr-7b-beta.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"truthfulqa_nl": {
|
4 |
-
"mc1": 0.3719745222929936,
|
5 |
-
"mc1_stderr": 0.0172618443903749,
|
6 |
-
"mc2": 0.5294532108691418,
|
7 |
-
"mc2_stderr": 0.016221848481192833
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"truthfulqa_nl": 1
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
-
"batch_size": 64,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|