Ilker Kesen
committed on
Commit 500fbd7
1 Parent(s): 74daf31
initialize the first version
- .gitignore +162 -0
- LICENSE +21 -0
- README.md +1 -14
- app.py +197 -0
- assets/kuis-ai-logo.png +0 -0
- data.py +121 -0
- data/datasets.json +185 -0
- environment.yaml +93 -0
- process_result.py +72 -0
- requirements.txt +8 -0
- results/zero-shot/aya-23-8b.json +161 -0
- results/zero-shot/aya-expanse-8b.json +159 -0
- results/zero-shot/aya101.json +172 -0
- results/zero-shot/commencis-7b.json +172 -0
- results/zero-shot/kanarya-2b.json +171 -0
- results/zero-shot/llama-3-8b-instruct.json +160 -0
- results/zero-shot/llama-3-8b.json +159 -0
- results/zero-shot/llama-3.1-8b-instruct.json +159 -0
- results/zero-shot/llama-3.1-8b.json +127 -0
- results/zero-shot/llama-3.2-1b.json +191 -0
- results/zero-shot/llama-3.2-3b-instruct.json +191 -0
- results/zero-shot/mistral-7b.json +165 -0
- results/zero-shot/trendyol-7b.json +172 -0
- results/zero-shot/turna.json +172 -0
- utils.py +28 -0
.gitignore
ADDED
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 KUIS AI Center
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,14 +1 @@
----
-title: Pergel
-emoji: 📈
-colorFrom: blue
-colorTo: pink
-sdk: streamlit
-sdk_version: 1.40.2
-app_file: app.py
-pinned: false
-license: mit
-short_description: 'Pergel: A Unified Benchmark for Evaluating Turkish LLMs'
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Cetvel-leaderboard
app.py
ADDED
@@ -0,0 +1,197 @@
+import streamlit as st
+import pandas as pd
+import json
+from utils import read_results, preprocess_path, get_model_url
+from data import Tasks, Metrics, DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS
+
+
+st.set_page_config(
+    page_title='Cetvel 📏',
+    layout='centered',
+)
+
+
+@st.cache_data
+def cache_results(path):
+    json_results = read_results(path)
+    results = list()
+    for entry in json_results:
+        row = {
+            'model': entry['model']['model'],
+            'num_parameters': entry['model']['num_parameters'],
+            'url': get_model_url(entry['model']),
+            'architecture': entry['model']['architecture'],
+            'type': entry['model']['type'],
+            'precision': entry['model']['dtype'],
+        }
+        for result in entry['results']:
+            task = result['task']
+            metric = TASK_METRIC_DICT.get(task)
+            score = result.get(metric)
+            score = 100 * score if metric != Metrics.WER and score is not None else score
+            row[result['name']] = score
+        results.append(row)
+    df = pd.DataFrame(results)
+    for group, metadata in DATASET_GROUPS.items():
+        df[group] = df[metadata['datasets']].mean(axis=1)
+    return df
+
+
+@st.cache_data
+def cache_datasets(path):
+    path = preprocess_path(path)
+    with open(path, 'r') as f:
+        datasets = json.load(f)
+    for key in datasets.keys():
+        datasets[key]['dataset'] = key
+    return datasets
+
+
+def create_column_configs(items):
+    column_configs = dict()
+    for key, metadata in items.items():
+        column_configs[key] = st.column_config.NumberColumn(
+            metadata.get('name', key),
+            help=metadata['description'],
+            min_value=0,
+            format="%2.2f"
+        )
+    return column_configs
+
+
+def insert_average(df, keys):
+    df = df.copy(deep=True)
+    df['average'] = df.loc[:, [x for x in df.columns if x in keys]].mean(axis=1)
+    df.insert(1, 'average', df.pop('average'))
+    df.index += 1
+    return df.sort_values(by=['average'], ascending=False)
+
+
+MODEL_SPEC_CONFIGS = {
+    'model': st.column_config.TextColumn(
+        'Model',
+        help='Large Language Model (LLM) used for the experiments.',
+        max_chars=120,
+
+    ),
+    'url': st.column_config.LinkColumn(
+        'URL',
+        help='Model URL.',
+        display_text='Click',
+    ),
+    'num_parameters': st.column_config.TextColumn(
+        '#params',
+        help='Approximate number of parameters.',
+    ),
+    'type': st.column_config.TextColumn(
+        'Type',
+        help='Model type based on training objective.',
+    ),
+    'average': st.column_config.NumberColumn(
+        'Avg.',
+        help='Average across task or dataset performances.',
+        format="%2.2f",
+    )
+}
+
+
+def filter_visible_model_specs():
+    specs = {
+        'URL': ('url', 1),
+        '#params': ('num_parameters', 2),
+        'Architecture': ('architecture', 3),
+        'Type': ('type', 4),
+        'Precision': ('precision', 5),
+    }
+    visible_specs = st.multiselect(
+        'Select model specs to be shown in the table.',
+        options=sorted(specs.keys(), key=lambda x: specs[x][1]),
+    )
+    # visible_specs = sorted(visible_specs, key=lambda x: specs[x][1])
+    return [specs[x][0] for x in visible_specs]
+
+
+def filter_by_model_spec():
+    pass
+
+
+def filter_visible_datasets(datasets):
+    col1, col2 = st.columns(2)
+    with col1:
+        dataset_grouping = st.selectbox(
+            'Dataset Grouping',
+            [
+                'Group Datasets',
+                'Show All Datasets',
+            ],
+        )
+
+    with col2:
+        filter_by_task = st.selectbox(
+            'Filter by Task',
+            [
+                'All',
+                'Understanding Tasks',
+                'Generation Tasks',
+                'Multiple Choice',
+                'Extractive Question Answering',
+                'Natural Language Inference',
+                'Text Classification',
+                'Summarization',
+            ],
+            disabled=dataset_grouping == "Group Datasets",
+        )
+
+    if dataset_grouping == 'Group Datasets':
+        return list(DATASET_GROUPS.keys())
+    elif dataset_grouping == 'Show All Datasets':
+        if filter_by_task == 'All':
+            return list(datasets.keys())
+        elif filter_by_task == 'Understanding Tasks':
+            this_datasets = [k for (k, v) in datasets.items() if not v['generative']]
+            return this_datasets
+        elif filter_by_task == 'Generation Tasks':
+            this_datasets = [k for (k, v) in datasets.items() if v['generative']]
+            return this_datasets
+        elif filter_by_task == 'Multiple Choice':
+            return DATASET_GROUPS['MCQA']['datasets']
+        elif filter_by_task == 'Extractive Question Answering':
+            return DATASET_GROUPS['QA']['datasets']
+        elif filter_by_task == 'Natural Language Inference':
+            return DATASET_GROUPS['NLI']['datasets']
+        elif filter_by_task == 'Text Classification':
+            return DATASET_GROUPS['TC']['datasets']
+        elif filter_by_task == 'Summarization':
+            return DATASET_GROUPS['SUM']['datasets']
+
+
+def introduction():
+    st.title(':blue[Cetvel :straight_ruler:]')
+    st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
+    st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool, specifically includes tasks/datasets for benchmarking Turkish Large Language Models (LLMs). Cetvel includes a variety of tasks curated to assess different aspects of model performance in the Turkish language. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, you can check the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')
+
+
+def main():
+    introduction()
+    results_df = cache_results('./results/zero-shot')
+    datasets = cache_datasets('./data/datasets.json')
+    dataset_column_configs = create_column_configs(datasets)
+    group_column_configs = create_column_configs(DATASET_GROUPS)
+    # score_columns = list(dataset_column_configs.keys()) + list(group_column_configs.keys())
+    column_configs = MODEL_SPEC_CONFIGS | group_column_configs | dataset_column_configs
+
+    visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
+    visible_model_columns = filter_visible_model_specs()
+    results_df = insert_average(results_df, visible_data_columns)
+
+    st.dataframe(
+        results_df,
+        use_container_width=True,
+        hide_index=True,
+        column_config=column_configs,
+        column_order=['model', 'average',] + visible_model_columns + visible_data_columns,
+    )
+    st.image('./assets/kuis-ai-logo.png', width=240)
+
+
+main()
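
For readers skimming the diff: `cache_results` above builds one row per model (scores scaled to 0-100 except WER) and `insert_average` averages the visible score columns. A minimal sketch of that averaging step on toy data, assuming illustrative column names and numbers rather than real leaderboard values; the formula mirrors `insert_average`:

```python
import pandas as pd

# Toy rows shaped like the output of cache_results() (scores already on a 0-100 scale).
df = pd.DataFrame({
    "model": ["model-a", "model-b"],   # illustrative model names, not real results
    "QA": [24.7, 25.0],
    "NLI": [47.7, 49.2],
})

# Same averaging as insert_average(): mean over the selected score columns,
# moved to the second position, rows sorted by the average.
keys = ["QA", "NLI"]
df["average"] = df.loc[:, [c for c in df.columns if c in keys]].mean(axis=1)
df.insert(1, "average", df.pop("average"))
print(df.sort_values(by=["average"], ascending=False))
```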
assets/kuis-ai-logo.png
ADDED
data.py
ADDED
@@ -0,0 +1,121 @@
+from enum import StrEnum, auto
+
+
+class Tasks(StrEnum):
+    EXTRACTIVE_QUESTION_ANSWERING = auto()
+    MULTIPLE_CHOICE = auto()
+    SUMMARIZATION = auto()
+    NATURAL_LANGUAGE_INFERENCE = auto()
+    TEXT_CLASSIFICATION = auto()
+    MACHINE_TRANSLATION = auto()
+    GRAMMATICAL_ERROR_CORRECTION = auto()
+
+
+class Metrics(StrEnum):
+    F1 = "f1"
+    EXACT_MATCH = "exact_match"
+    ROGUE1 = "rouge1"
+    ROUGE2 = "rouge2"
+    ROUGEL = "rougeL"
+    ACCURACY = "acc"
+    WER = "wer"
+    BLEU = "bleu"
+
+
+DATASET_TASK_DICT = {
+    # extractive qa
+    'xquad_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+    'tquad': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+    'mkqa_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING,  # not exactly
+
+    # summarization
+    'xlsum_tr': Tasks.SUMMARIZATION,
+    'mlsum_tr': Tasks.SUMMARIZATION,
+    'wiki_lingua_tr': Tasks.SUMMARIZATION,
+    'tr-wikihow-summ': Tasks.SUMMARIZATION,
+
+    # NLI
+    #'nli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+    'mnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+    'snli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+    'xnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+
+    # multiple-choice
+    'xcopa_tr': Tasks.MULTIPLE_CHOICE,
+    'exams_tr': Tasks.MULTIPLE_CHOICE,
+    'belebele_tr': Tasks.MULTIPLE_CHOICE,
+    'turkish_plu': Tasks.MULTIPLE_CHOICE,
+    'turkish_plu_goal_inference': Tasks.MULTIPLE_CHOICE,
+    'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
+    'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
+    'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
+
+    # fact-checking, not sure whether these are multi-choice
+    # 'trclaim19': Tasks.MULTIPLE_CHOICE,
+    'check_worthiness': Tasks.MULTIPLE_CHOICE,
+    'relevance_judgment': Tasks.MULTIPLE_CHOICE,
+
+    # text classification
+    'sts_tr': Tasks.TEXT_CLASSIFICATION,
+    'offenseval_tr': Tasks.TEXT_CLASSIFICATION,
+    'news_cat': Tasks.TEXT_CLASSIFICATION,
+    'ironytr': Tasks.TEXT_CLASSIFICATION,
+
+    # other generation
+    'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
+    'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
+}
+
+
+TASK_METRIC_DICT = {
+    Tasks.EXTRACTIVE_QUESTION_ANSWERING: Metrics.EXACT_MATCH,
+    Tasks.MULTIPLE_CHOICE: Metrics.ACCURACY,
+    Tasks.TEXT_CLASSIFICATION: Metrics.ACCURACY,
+    Tasks.NATURAL_LANGUAGE_INFERENCE: Metrics.ACCURACY,
+    Tasks.SUMMARIZATION: Metrics.ROUGE2,
+    Tasks.MACHINE_TRANSLATION: Metrics.BLEU,
+    Tasks.GRAMMATICAL_ERROR_CORRECTION: Metrics.EXACT_MATCH,
+}
+
+
+GENERATIVE_TASKS = (
+    Tasks.SUMMARIZATION,
+    Tasks.MACHINE_TRANSLATION,
+    Tasks.GRAMMATICAL_ERROR_CORRECTION,
+)
+
+DATASET_GROUPS = {
+    'QA': {
+        'datasets': ['xquad_tr', 'tquad', 'mkqa_tr'],
+        'description': 'Turkish splits of SQuAD-like datasets XQuAD and TQUAD.',
+    },
+    'MCQA': {
+        'datasets': ['xcopa_tr', 'exams_tr', 'belebele_tr'] + [x for x in DATASET_TASK_DICT.keys() if x.startswith('turkish_plu')],
+        'description': 'Multiple Choice Question Answering datasets: XCOPA, Exams, Belebele and Turkish PLU.'
+    },
+    'TC': {
+        'datasets': ['sts_tr', 'offenseval_tr', 'news_cat', 'ironytr', ],
+        'description': 'Text Classification datasets.',
+    },
+    'NLI': {
+        'datasets': ['mnli_tr', 'snli_tr', 'xnli_tr'],
+        'description': 'Natural Language Inference (NLI) datasets in Turkish: XNLI, SNLI and MNLI.',
+    },
+    'SUM': {
+        'datasets': [name for name, task in DATASET_TASK_DICT.items() if task == Tasks.SUMMARIZATION],
+        'description': 'Summarization datasets in Turkish (XLSum, MLSum, WikiLingua and TrWikiHowSumm).',
+    },
+    'GEC': {
+        'datasets': ['gecturk_generation',],
+        'description': 'Grammatical Error Correction task.',
+    },
+    'MT': {
+        'datasets': ['wmt-tr-en-prompt'],
+        'description': 'Machine Translation on WMT-16 dataset (English-to-Turkish).',
+    },
+
+    # 'TrClaim19': {
+    #     'datasets': ['check_worthiness', 'relevance_judgment'],
+    #     'description': 'TrClaim19 dataset for fact-checking.',
+    # },
+}
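
A minimal sketch of how the mappings above are meant to be combined when a score is attached to a dataset; it mirrors the lookup done in `cache_results` in app.py, and the chosen dataset key is just an example:

```python
from data import DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS

dataset = "xquad_tr"               # any key of DATASET_TASK_DICT
task = DATASET_TASK_DICT[dataset]  # Tasks.EXTRACTIVE_QUESTION_ANSWERING
metric = TASK_METRIC_DICT[task]    # Metrics.EXACT_MATCH

# Group membership drives the aggregated leaderboard columns.
groups = [g for g, meta in DATASET_GROUPS.items() if dataset in meta["datasets"]]
print(dataset, str(task), str(metric), groups)
# xquad_tr extractive_question_answering exact_match ['QA']
```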
data/datasets.json
ADDED
@@ -0,0 +1,185 @@
+{
+  "tquad": {"name": "TQUAD", "task": "extractive_question_answering", "description": "This dataset is the Turkish Question & Answer dataset on Turkish & Islamic Science History within the scope of Teknofest 2018 Artificial Intelligence competition.", "url": "https://github.com/TQuad/turkish-nlp-qa-dataset", "hf_name": "mcemilg/tquad", "generative": false},
+  "xquad_tr": {"name": "XQUAD", "task": "extractive_question_answering", "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi.", "url": "https://github.com/google-deepmind/xquad", "hf_name": "google/xquad", "generative": false},
+  "mkqa_tr": {"name": "MKQA", "task": "extractive_question_answering", "description": "MKQA: Multilingual Knowledge Questions & Answers. MKQA includes 10k open-domain question-answer pairs in 26 languages, resulting 260k examples in total.", "url": "https://github.com/apple/ml-mkqa", "hf_name": "mcemilg/mkqa_tr", "generative": false},
+  "xlsum_tr": {"name": "XLSum", "task": "summarization", "description": "Abstractive summarization dataset for 44 languages.", "url": "https://github.com/csebuetnlp/xl-sum", "hf_name": "csebuetnlp/xlsum", "generative": true},
+  "mlsum_tr": {"name": "MLSum", "task": "summarization", "description": "A multilingual summarization dataset collected from the newspapers' websites. MLSum contains 1.5M examples in 5 languages including Turkish.", "url": "https://huggingface.co/datasets/reciTAL/mlsum", "hf_name": "reciTAL/mlsum", "generative": true},
+  "wiki_lingua_tr": {"name": "WikiLingua", "task": "summarization", "description": "A multilingual abstractive summarization dataset covering 17 languages.", "url": "https://github.com/esdurmus/Wikilingua", "hf_name": "GEM/wiki_lingua", "generative": true},
+  "tr-wikihow-summ": {"name": "WikiHowSumm", "task": "summarization", "description": "A summarization dataset obtained from WikiHow website.", "url": "https://huggingface.co/datasets/ardauzunoglu/tr-wikihow-summ", "hf_name": "ardauzunoglu/tr-wikihow-summ", "generative": true},
+  "mnli_tr": {"name": "MNLI", "task": "natural_language_inference", "description": "Multi-Genre NLI (MNLI) dataset.", "url": "https://cims.nyu.edu/~sbowman/multinli/", "hf_name": "boun-tabi/nli_tr", "generative": false},
+  "snli_tr": {"name": "SNLI", "task": "natural_language_inference", "description": "The Stanford NLI (SNLI) dataset.", "url": "https://nlp.stanford.edu/projects/snli/", "hf_name": "boun-tabi/nli_tr", "generative": false},
+  "xnli_tr": {"name": "XNLI", "task": "natural_language_inference", "description": "The Cross-Lingual NLI (XNLI) dataset.", "url": "https://github.com/facebookresearch/XNLI", "hf_name": "boun-tabi/nli_tr", "generative": false},
+  "xcopa_tr": {"name": "XCOPA", "task": "multiple_choice", "description": "A multilingual dataset for evaluating causal commonsense reasoning capabilities of language models.", "url": "https://github.com/cambridgeltl/xcopa", "hf_name": "cambridgeltl/xcopa", "generative": false},
+  "exams_tr": {"name": "Exams", "task": "multiple_choice", "description": "A question answering dataset covering high school exams.", "url": "https://huggingface.co/datasets/exams", "hf_name": "exams", "generative": false},
+  "belebele_tr": {"name": "Belebele", "task": "multiple_choice", "description": "A multiple choice question answering dataset to evaluate machine comprehension.", "url": "https://github.com/facebookresearch/belebele", "generative": false},
+  "turkish_plu_goal_inference": {"name": "PLU-GI", "task": "multiple_choice", "description": "TurkishPLU - Goal Inference task.", "url": "https://github.com/GGLAB-KU/turkish-plu", "hf_name": "mcemilg/turkish-plu-goal-inference", "generative": false},
+  "turkish_plu_next_event_prediction": {"name": "PLU-NE", "task": "multiple_choice", "description": "TurkishPLU - Next Event Prediction task.", "url": "https://github.com/GGLAB-KU/turkish-plu", "hf_name": "mcemilg/turkish-plu-next-event-prediction", "generative": false},
+  "turkish_plu_step_inference": {"name": "PLU-SI", "task": "multiple_choice", "description": "TurkishPLU - Step Inference task.", "url": "https://github.com/GGLAB-KU/turkish-plu", "hf_name": "mcemilg/turkish-plu-step-inference", "generative": false},
+  "turkish_plu_step_ordering": {"name": "PLU-SO", "task": "multiple_choice", "description": "TurkishPLU - Step Ordering task.", "url": "https://github.com/GGLAB-KU/turkish-plu", "hf_name": "mcemilg/turkish-plu-step-ordering", "generative": false},
+  "sts_tr": {"name": "STS", "task": "text_classification", "description": "The machine-translated Semantic Textual Similarity dataset in Turkish.", "url": "https://github.com/emrecncelik/sts-benchmark-tr", "hf_name": "emrecan/stsb-mt-turkish", "generative": false},
+  "offenseval_tr": {"name": "OffensEval", "task": "text_classification", "description": "A dataset for offensive speech recognition in Turkish.", "url": "https://sites.google.com/site/offensevalsharedtask/offenseval-2020", "hf_name": "coltekin/offenseval2020_tr", "generative": false},
+  "news_cat": {"name": "NewsCat", "task": "text_classification", "description": "News classification dataset collected from Turkish newspapers websites.", "url": "http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html", "hf_name": "mcemilg/news-cat", "generative": false},
+  "ironytr": {"name": "IronyTR", "task": "text_classification", "description": "Irony detection dataset in Turkish.", "url": "https://github.com/teghub/IronyTR", "hf_name": "mcemilg/IronyTR", "generative": false},
+  "wmt-tr-en-prompt": {"name": "WMT", "task": "machine_translation", "description": "English-to-Turkish machine translation dataset.", "url": "http://www.aclweb.org/anthology/W/W16/W16-2301", "hf_name": "wmt/wmt16", "generative": true},
+  "gecturk_generation": {"name": "GECTurk", "task": "grammatical_error_correction", "description": "A dataset for grammatical error correction.", "url": "https://github.com/GGLAB-KU/gecturk", "hf_name": "mcemilg/GECTurk-generation", "generative": true}
+}
environment.yaml
ADDED
@@ -0,0 +1,93 @@
+name: Cetvel-leaderboard
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2024.7.2=h06a4308_0
+  - expat=2.6.2=h6a678d5_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.14=h5eee18b_0
+  - python=3.12.4=h5148396_1
+  - readline=8.2=h5eee18b_0
+  - sqlite=3.45.3=h5eee18b_0
+  - tk=8.6.14=h39e8969_0
+  - wheel=0.43.0=py312h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_1
+  - pip:
+    - altair==5.3.0
+    - asttokens==2.4.1
+    - attrs==23.2.0
+    - blinker==1.8.2
+    - cachetools==5.3.3
+    - certifi==2024.7.4
+    - charset-normalizer==3.3.2
+    - click==8.1.7
+    - contourpy==1.2.1
+    - cycler==0.12.1
+    - decorator==5.1.1
+    - executing==2.0.1
+    - fonttools==4.53.1
+    - gitdb==4.0.11
+    - gitpython==3.1.43
+    - idna==3.7
+    - ipdb==0.13.13
+    - ipython==8.26.0
+    - jedi==0.19.1
+    - jinja2==3.1.4
+    - jsonschema==4.23.0
+    - jsonschema-specifications==2023.12.1
+    - kiwisolver==1.4.5
+    - markdown-it-py==3.0.0
+    - markupsafe==2.1.5
+    - matplotlib==3.9.1
+    - matplotlib-inline==0.1.7
+    - mdurl==0.1.2
+    - numpy==2.0.0
+    - packaging==24.1
+    - pandas==2.2.2
+    - parso==0.8.4
+    - pexpect==4.9.0
+    - pillow==10.4.0
+    - pip==24.1.2
+    - prompt-toolkit==3.0.47
+    - protobuf==5.27.2
+    - ptyprocess==0.7.0
+    - pure-eval==0.2.2
+    - pyarrow==16.1.0
+    - pydeck==0.9.1
+    - pygments==2.18.0
+    - pyparsing==3.1.2
+    - python-dateutil==2.9.0.post0
+    - pytz==2024.1
+    - redis==5.0.7
+    - referencing==0.35.1
+    - requests==2.32.3
+    - rich==13.7.1
+    - rpds-py==0.19.0
+    - semantic-version==2.10.0
+    - setuptools==70.3.0
+    - setuptools-rust==1.9.0
+    - six==1.16.0
+    - smmap==5.0.1
+    - stack-data==0.6.3
+    - streamlit==1.36.0
+    - tenacity==8.5.0
+    - toml==0.10.2
+    - toolz==0.12.1
+    - tornado==6.4.1
+    - traitlets==5.14.3
+    - typing-extensions==4.12.2
+    - tzdata==2024.1
+    - urllib3==2.2.2
+    - watchdog==4.0.1
+    - wcwidth==0.2.13
+prefix: /home/ilker/miniconda3/envs/streamlit-tutor
process_result.py
ADDED
@@ -0,0 +1,72 @@
+import os.path as osp
+import argparse
+import json
+from data import Tasks, DATASET_TASK_DICT
+from utils import preprocess_path
+
+
+def process_result(entry, name, task):
+    processed = {
+        'name': name,
+        'task': str(task),
+    }
+
+    if task == Tasks.EXTRACTIVE_QUESTION_ANSWERING:
+        key = 'em,none' if name == 'mkqa_tr' else 'exact,none'
+        scale = 0.01 if name != 'mkqa_tr' else 1
+        processed['exact_match'] = scale * entry[key]
+        processed['f1'] = scale * entry['f1,none']
+    elif task == Tasks.SUMMARIZATION:
+        processed['rouge1'] = entry['rouge1,none']
+        processed['rouge2'] = entry['rouge2,none']
+        processed['rougeL'] = entry['rougeL,none']
+    elif task in (
+        Tasks.MULTIPLE_CHOICE,
+        Tasks.NATURAL_LANGUAGE_INFERENCE,
+        Tasks.TEXT_CLASSIFICATION,
+    ):
+        processed['acc'] = entry['acc,none']
+        processed['acc_norm'] = entry.get('acc_norm,none', processed['acc'])
+    elif task == Tasks.MACHINE_TRANSLATION:
+        processed['wer'] = entry['wer,none']
+        processed['bleu'] = entry['bleu,none']
+    elif task == Tasks.GRAMMATICAL_ERROR_CORRECTION:
+        processed['exact_match'] = entry['exact_match,none']
+
+    return processed
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Results file formatter.')
+    parser.add_argument('-i', '--input-file', type=str, help='Input JSON file for the results.')
+    parser.add_argument('-o', '--output-file', type=str, help='Output JSON file for the formatted results.')
+    args = parser.parse_args()
+
+    with open(preprocess_path(args.input_file)) as f:
+        raw_data = json.load(f)
+
+    # first, get model args
+    model_args = raw_data['config']['model_args'].split(',')
+    model_args = dict([tuple(pair.split('=')) for pair in model_args])
+    processed = dict()
+    model_args['model'] = model_args.pop('pretrained')
+    processed['model'] = model_args
+    processed['model']['api'] = raw_data['config']['model']
+
+    # then, process results
+    results = raw_data['results']
+    processed['results'] = list()
+    for dataset, entry in results.items():
+        if dataset not in DATASET_TASK_DICT.keys():
+            continue
+        task = DATASET_TASK_DICT[dataset]
+        processed['results'].append(process_result(entry, dataset, task))
+
+    with open(preprocess_path(args.output_file), 'w') as f:
+        json.dump(processed, f, indent=4)
+
+    print('done')
+
+
+if __name__ == '__main__':
+    main()
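
The formatter above is normally driven by its two CLI flags (`-i`/`-o`). As a hedged sketch of the per-entry step it performs, the snippet below uses a hand-written stand-in for one lm-eval-harness result dict; the numbers are illustrative only:

```python
from data import DATASET_TASK_DICT
from process_result import process_result

# Illustrative stand-in for one dataset's entry in raw_data["results"].
raw_entry = {"acc,none": 0.61, "acc_norm,none": 0.63}

task = DATASET_TASK_DICT["belebele_tr"]  # Tasks.MULTIPLE_CHOICE
print(process_result(raw_entry, "belebele_tr", task))
# -> {'name': 'belebele_tr', 'task': 'multiple_choice', 'acc': 0.61, 'acc_norm': 0.63}
```

`main()` is the same transformation looped over every dataset in `DATASET_TASK_DICT`, then written out with `json.dump` in the format used by the result files below.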
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+altair==5.3.0
+click==8.1.7
+matplotlib==3.9.1
+numpy==2.0.0
+pandas==2.2.2
+pillow==10.4.0
+streamlit==1.36.0
+tornado==6.4.1
results/zero-shot/aya-23-8b.json
ADDED
@@ -0,0 +1,161 @@
+{
+  "model": {"load_in_8bit": "True", "trust_remote_code": "True", "model": "CohereForAI/aya-23-8B", "api": "hf", "architecture": "CohereForCausalLM", "dtype": "float16", "max_length": 8192, "type": "instruction-tuned", "num_parameters": "8b"},
+  "results": [
+    {"name": "belebele_tr", "task": "multiple_choice", "acc": 0.6067, "acc_norm": 0.6067},
+    {"name": "exams_tr", "task": "multiple_choice", "acc": 0.2697, "acc_norm": 0.2901},
+    {"name": "check_worthiness", "task": "multiple_choice", "acc": 0.38345521023765994, "acc_norm": 0.49177330895795246},
+    {"name": "ironytr", "task": "text_classification", "acc": 0.5166666666666667, "acc_norm": 0.5016666666666667},
+    {"name": "mkqa_tr", "task": "extractive_question_answering", "exact_match": 0.10017756732761172, "f1": 0.16569513329103133},
+    {"name": "mnli_tr", "task": "natural_language_inference", "acc": 0.3436, "acc_norm": 0.3477},
+    {"name": "news_cat", "task": "text_classification", "acc": 0.724, "acc_norm": 0.632},
+    {"name": "offenseval_tr", "task": "text_classification", "acc": 0.3424036281179138, "acc_norm": 0.7865646258503401},
+    {"name": "relevance_judgment", "task": "multiple_choice", "acc": 0.42550274223034734, "acc_norm": 0.4273308957952468},
+    {"name": "snli_tr", "task": "natural_language_inference", "acc": 0.3249, "acc_norm": 0.3367},
+    {"name": "sts_tr", "task": "text_classification", "acc": 0.22987672226250908, "acc_norm": 0.19434372733865118},
+    {"name": "tquad", "task": "extractive_question_answering", "exact_match": 0.2062780269058296, "f1": 0.4653972244152745},
+    {"name": "turkish_plu_goal_inference", "task": "multiple_choice", "acc": 0.3918757467144564, "acc_norm": 0.3859020310633214},
+    {"name": "turkish_plu_next_event_prediction", "task": "multiple_choice", "acc": 0.4687022900763359, "acc_norm": 0.5374045801526718},
+    {"name": "turkish_plu_step_inference", "task": "multiple_choice", "acc": 0.33986928104575165, "acc_norm": 0.45098039215686275},
+    {"name": "turkish_plu_step_ordering", "task": "multiple_choice", "acc": 0.6180215475024485, "acc_norm": 0.6180215475024485},
+    {"name": "xcopa_tr", "task": "multiple_choice", "acc": 0.596, "acc_norm": 0.596},
+    {"name": "xnli_tr", "task": "natural_language_inference", "acc": 0.4771084337349398, "acc_norm": 0.4771084337349398},
+    {"name": "xquad_tr", "task": "extractive_question_answering", "exact_match": 0.24705882352941178, "f1": 0.44192474929656556},
+    {"name": "gecturk_generation", "task": "grammatical_error_correction", "exact_match": 0.008281573498964804},
+    {"name": "mlsum_tr", "task": "summarization", "rouge1": 0.37037019926313125, "rouge2": 0.24005923597941317, "rougeL": 0.31098002776173184},
+    {"name": "wiki_lingua_tr", "task": "summarization", "rouge1": 0.2645070959726481, "rouge2": 0.11354354716145479, "rougeL": 0.21357621995467704},
+    {"name": "wmt-tr-en-prompt", "task": "machine_translation", "wer": 0.7464128097803795, "bleu": 0.16878189334002527},
+    {"name": "xlsum_tr", "task": "summarization", "rouge1": 0.2855728817569547, "rouge2": 0.14081555638864124, "rougeL": 0.23467303626936886}
+  ]
+}
results/zero-shot/aya-expanse-8b.json
ADDED
@@ -0,0 +1,159 @@
+{
+  "model": {"model": "CohereForAI/aya-expanse-8b", "api": "hf", "architecture": "CohereForCausalLM", "max_length": 8192, "dtype": "float16", "type": "instruction-tuned", "num_parameters": "8b"},
+  "results": [
+    {"name": "belebele_tr", "task": "multiple_choice", "acc": 0.7355555555555555, "acc_norm": 0.7355555555555555},
+    {"name": "exams_tr", "task": "multiple_choice", "acc": 0.3155216284987277, "acc_norm": 0.3460559796437659},
+    {"name": "check_worthiness", "task": "multiple_choice", "acc": 0.4026508226691042, "acc_norm": 0.6224862888482633},
+    {"name": "gecturk_generation", "task": "grammatical_error_correction", "exact_match": 0.0018296499590736194},
+    {"name": "ironytr", "task": "text_classification", "acc": 0.505, "acc_norm": 0.49833333333333335},
+    {"name": "mkqa_tr", "task": "extractive_question_answering", "exact_match": 0.06954720331459012, "f1": 0.13476533908972033},
+    {"name": "mlsum_tr", "task": "summarization", "rouge1": 0.363610486561065, "rouge2": 0.21362825588593481, "rougeL": 0.29773476508614094},
+    {"name": "mnli_tr", "task": "natural_language_inference", "acc": 0.3078, "acc_norm": 0.35},
+    {"name": "news_cat", "task": "text_classification", "acc": 0.76, "acc_norm": 0.58},
+    {"name": "offenseval_tr", "task": "text_classification", "acc": 0.2675736961451247, "acc_norm": 0.7956349206349206},
+    {"name": "relevance_judgment", "task": "multiple_choice", "acc": 0.5877513711151737, "acc_norm": 0.579981718464351},
+    {"name": "snli_tr", "task": "natural_language_inference", "acc": 0.344, "acc_norm": 0.3435},
+    {"name": "sts_tr", "task": "text_classification", "acc": 0.2095721537345903, "acc_norm": 0.21029731689630166},
+    {"name": "tquad", "task": "extractive_question_answering", "exact_match": 0.13452914798206278, "f1": 0.435087842533856},
+    {"name": "turkish_plu_goal_inference", "task": "multiple_choice", "acc": 0.4062126642771804, "acc_norm": 0.3930704898446834},
+    {"name": "turkish_plu_next_event_prediction", "task": "multiple_choice", "acc": 0.4900763358778626, "acc_norm": 0.5465648854961832},
+    {"name": "turkish_plu_step_inference", "task": "multiple_choice", "acc": 0.3464052287581699, "acc_norm": 0.4395424836601307},
+    {"name": "turkish_plu_step_ordering", "task": "multiple_choice", "acc": 0.5935357492654261, "acc_norm": 0.5935357492654261},
+    {"name": "wiki_lingua_tr", "task": "summarization", "rouge1": 0.3064320242538614, "rouge2": 0.1340385267540697, "rougeL": 0.24764232131755232},
+    {"name": "wmt-tr-en-prompt", "task": "machine_translation", "wer": 0.7822550373875778, "bleu": 0.17034711245148307},
+    {"name": "xcopa_tr", "task": "multiple_choice", "acc": 0.578, "acc_norm": 0.578},
+    {"name": "xlsum_tr", "task": "summarization", "rouge1": 0.26621653203927675, "rouge2": 0.133428873146516, "rougeL": 0.2083669711429916},
+    {"name": "xnli_tr", "task": "natural_language_inference", "acc": 0.4919678714859438, "acc_norm": 0.4919678714859438},
+    {"name": "xquad_tr", "task": "extractive_question_answering", "exact_match": 0.2495798319327731, "f1": 0.4735125568867167}
+  ]
+}
results/zero-shot/aya101.json
ADDED
@@ -0,0 +1,172 @@
+{
+  "model": {"dtype": "bfloat16", "max_length": 4096, "model": "CohereForAI/aya-101", "api": "hf", "architecture": "T5ForConditionalGeneration", "type": "instruction-tuned", "num_parameters": "13b"},
+  "results": [
+    {"name": "xquad_tr", "task": "extractive_question_answering", "exact_match": 0.07563025210084033, "f1": 0.16462359535888943},
+    {"name": "xlsum_tr", "task": "summarization", "rouge1": 0.02416422194769531, "rouge2": 0.00149839274458772, "rougeL": 0.02416422194769531},
+    {"name": "xcopa_tr", "task": "multiple_choice", "acc": 0.596, "acc_norm": 0.596},
+    {"name": "wmt-tr-en-prompt", "task": "machine_translation", "wer": 0.9853633715998092, "bleu": 0.0},
+    {"name": "wiki_lingua_tr", "task": "summarization", "rouge1": 0.029006633700390562, "rouge2": 0.0004998910319276452, "rougeL": 0.028967197984657227},
+    {"name": "turkish_plu", "task": "multiple_choice", "acc": 0.41344, "acc_norm": 0.42816},
+    {"name": "turkish_plu_goal_inference", "task": "multiple_choice", "acc": 0.3739545997610514, "acc_norm": 0.33811230585424135},
+    {"name": "turkish_plu_next_event_prediction", "task": "multiple_choice", "acc": 0.34961832061068704, "acc_norm": 0.38625954198473283},
+    {"name": "turkish_plu_step_inference", "task": "multiple_choice", "acc": 0.272875816993464, "acc_norm": 0.35784313725490197},
+    {"name": "turkish_plu_step_ordering", "task": "multiple_choice", "acc": 0.5710088148873653, "acc_norm": 0.5710088148873653},
+    {"name": "check_worthiness", "task": "multiple_choice", "acc": 0.553473491773309, "acc_norm": 0.6238574040219378},
+    {"name": "relevance_judgment", "task": "multiple_choice", "acc": 0.6709323583180987, "acc_norm": 0.5781535648994516},
+    {"name": "tr-wikihow-summ", "task": "summarization", "rouge1": 0.02053796966151103, "rouge2": 0.00029270301029826366, "rougeL": 0.020495031370814234},
+    {"name": "tquad", "task": "extractive_question_answering", "exact_match": 0.053811659192825115, "f1": 0.09199690627084456},
+    {"name": "sts_tr", "task": "text_classification", "acc": 0.1696881798404641, "acc_norm": 0.18781725888324874},
+    {"name": "offenseval_tr", "task": "text_classification", "acc": 0.7993197278911565, "acc_norm": 0.7970521541950113},
+    {"name": "mnli_tr", "task": "natural_language_inference", "acc": 0.279, "acc_norm": 0.3386},
+    {"name": "snli_tr", "task": "natural_language_inference", "acc": 0.2558, "acc_norm": 0.3279},
+    {"name": "xnli_tr", "task": "natural_language_inference", "acc": 0.2998003992015968, "acc_norm": 0.34291417165668664},
+    {"name": "news_cat", "task": "text_classification", "acc": 0.2, "acc_norm": 0.2},
+    {"name": "mlsum_tr", "task": "summarization", "rouge1": 0.021746360547255133, "rouge2": 0.003113110667892852, "rougeL": 0.021727065059735186},
+    {"name": "mkqa_tr", "task": "extractive_question_answering", "exact_match": 0.025451316957679788, "f1": 0.05324060372891391},
+    {"name": "ironytr", "task": "text_classification", "acc": 0.5216666666666666, "acc_norm": 0.5},
+    {"name": "gecturk_generation", "task": "grammatical_error_correction", "exact_match": 0.0},
+    {"name": "exams_tr", "task": "multiple_choice", "acc": 0.22900763358778625, "acc_norm": 0.2366412213740458},
+    {"name": "belebele_tr", "task": "multiple_choice", "acc": 0.2288888888888889, "acc_norm": 0.2288888888888889}
+  ]
+}
results/zero-shot/commencis-7b.json
ADDED
@@ -0,0 +1,172 @@
+{
+  "model": {"dtype": "bfloat16", "max_length": "4096", "model": "Commencis/Commencis-LLM", "api": "hf", "architecture": "MistralForCausalLM", "type": "instruction-tuned", "num_parameters": "7b"},
+  "results": [
+    {"name": "xquad_tr", "task": "extractive_question_answering", "exact_match": 0.06638655462184874, "f1": 0.22895337255761397},
+    {"name": "xlsum_tr", "task": "summarization", "rouge1": 0.23661435034483103, "rouge2": 0.09475637339836376, "rougeL": 0.17114647899378693},
+    {"name": "xcopa_tr", "task": "multiple_choice", "acc": 0.58, "acc_norm": 0.58},
+    {"name": "wmt-tr-en-prompt", "task": "machine_translation", "wer": 1.292660190832963, "bleu": 0.046829706960566486},
+    {"name": "wiki_lingua_tr", "task": "summarization", "rouge1": 0.20899244459581318, "rouge2": 0.06262304805792501, "rougeL": 0.15190187433999106},
+    {"name": "turkish_plu", "task": "multiple_choice", "acc": 0.4128, "acc_norm": 0.46176},
+    {"name": "turkish_plu_goal_inference", "task": "multiple_choice", "acc": 0.34767025089605735, "acc_norm": 0.38948626045400236},
+    {"name": "turkish_plu_next_event_prediction", "task": "multiple_choice", "acc": 0.38625954198473283, "acc_norm": 0.46259541984732827},
+    {"name": "turkish_plu_step_inference", "task": "multiple_choice", "acc": 0.2761437908496732, "acc_norm": 0.3872549019607843},
+    {"name": "turkish_plu_step_ordering", "task": "multiple_choice", "acc": 0.56513222331048, "acc_norm": 0.56513222331048},
+    {"name": "check_worthiness", "task": "multiple_choice", "acc": 0.3903107861060329, "acc_norm": 0.4835466179159049},
+    {"name": "relevance_judgment", "task": "multiple_choice", "acc": 0.5077696526508226, "acc_norm": 0.526508226691042},
+    {"name": "tr-wikihow-summ", "task": "summarization", "rouge1": 0.23101542478965895, "rouge2": 0.0718775262261334, "rougeL": 0.16318786708633073},
+    {"name": "tquad", "task": "extractive_question_answering", "exact_match": 0.053811659192825115, "f1": 0.3110458108565287},
+    {"name": "sts_tr", "task": "text_classification", "acc": 0.14865844815083393, "acc_norm": 0.2226250906453952},
+    {"name": "offenseval_tr", "task": "text_classification", "acc": 0.24263038548752835, "acc_norm": 0.29365079365079366},
+    {"name": "mnli_tr", "task": "natural_language_inference", "acc": 0.3058, "acc_norm": 0.3103},
+    {"name": "snli_tr", "task": "natural_language_inference", "acc": 0.2972, "acc_norm": 0.32},
+    {"name": "xnli_tr", "task": "natural_language_inference", "acc": 0.3141716566866267, "acc_norm": 0.3281437125748503},
+    {"name": "news_cat", "task": "text_classification", "acc": 0.624, "acc_norm": 0.368},
+    {"name": "mlsum_tr", "task": "summarization", "rouge1": 0.30963778437323686, "rouge2": 0.16100694114326877, "rougeL": 0.23447680384800107},
+    {"name": "mkqa_tr", "task": "extractive_question_answering", "exact_match": 0.0324060372891388, "f1": 0.07231572678508513},
+    {"name": "ironytr", "task": "text_classification", "acc": 0.56, "acc_norm": 0.54},
+    {"name": "gecturk_generation", "task": "grammatical_error_correction", "exact_match": 0.1701574461938466},
+    {"name": "exams_tr", "task": "multiple_choice", "acc": 0.24681933842239187, "acc_norm": 0.29770992366412213},
+    {"name": "belebele_tr", "task": "multiple_choice", "acc": 0.3233333333333333, "acc_norm": 0.3233333333333333}
+  ]
+}
results/zero-shot/kanarya-2b.json
ADDED
@@ -0,0 +1,171 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"dtype": "float16",
|
4 |
+
"model": "asafaya/kanarya-2b",
|
5 |
+
"api": "hf",
|
6 |
+
"architecture": "GPTJForCausalLM",
|
7 |
+
"type": "pretrained",
|
8 |
+
"num_parameters": "3b"
|
9 |
+
},
|
10 |
+
"results": [
|
11 |
+
{
|
12 |
+
"name": "belebele_tr",
|
13 |
+
"task": "multiple_choice",
|
14 |
+
"acc": 0.2811111111111111,
|
15 |
+
"acc_norm": 0.2811111111111111
|
16 |
+
},
|
17 |
+
{
|
18 |
+
"name": "exams_tr",
|
19 |
+
"task": "multiple_choice",
|
20 |
+
"acc": 0.30025445292620867,
|
21 |
+
"acc_norm": 0.3256997455470738
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"name": "gecturk_generation",
|
25 |
+
"task": "grammatical_error_correction",
|
26 |
+
"exact_match": 9.62973662670326e-05
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"name": "ironytr",
|
30 |
+
"task": "text_classification",
|
31 |
+
"acc": 0.5,
|
32 |
+
"acc_norm": 0.5016666666666667
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"name": "mkqa_tr",
|
36 |
+
"task": "extractive_question_answering",
|
37 |
+
"exact_match": 0.005770938147380882,
|
38 |
+
"f1": 0.0157485308417537
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"name": "mlsum_tr",
|
42 |
+
"task": "summarization",
|
43 |
+
"rouge1": 0.380182975983147,
|
44 |
+
"rouge2": 0.2469518162622865,
|
45 |
+
"rougeL": 0.30607429328228153
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"name": "news_cat",
|
49 |
+
"task": "text_classification",
|
50 |
+
"acc": 0.668,
|
51 |
+
"acc_norm": 0.556
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"name": "mnli_tr",
|
55 |
+
"task": "natural_language_inference",
|
56 |
+
"acc": 0.3278,
|
57 |
+
"acc_norm": 0.3463
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"name": "snli_tr",
|
61 |
+
"task": "natural_language_inference",
|
62 |
+
"acc": 0.3088,
|
63 |
+
"acc_norm": 0.3109
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"name": "xnli_tr",
|
67 |
+
"task": "natural_language_inference",
|
68 |
+
"acc": 0.3273453093812375,
|
69 |
+
"acc_norm": 0.3341317365269461
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"name": "offenseval_tr",
|
73 |
+
"task": "text_classification",
|
74 |
+
"acc": 0.6159297052154195,
|
75 |
+
"acc_norm": 0.796485260770975
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"name": "sts_tr",
|
79 |
+
"task": "text_classification",
|
80 |
+
"acc": 0.12907904278462654,
|
81 |
+
"acc_norm": 0.12037708484408992
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"name": "tquad",
|
85 |
+
"task": "extractive_question_answering",
|
86 |
+
"exact_match": 0.016816143497757848,
|
87 |
+
"f1": 0.046325790025566756
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"name": "check_worthiness",
|
91 |
+
"task": "multiple_choice",
|
92 |
+
"acc": 0.623400365630713,
|
93 |
+
"acc_norm": 0.6238574040219378
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"name": "relevance_judgment",
|
97 |
+
"task": "multiple_choice",
|
98 |
+
"acc": 0.5068555758683729,
|
99 |
+
"acc_norm": 0.5758683729433273
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"name": "turkish_plu",
|
103 |
+
"task": "multiple_choice",
|
104 |
+
"acc": 0.4928,
|
105 |
+
"acc_norm": 0.536
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"name": "turkish_plu_goal_inference",
|
109 |
+
"task": "multiple_choice",
|
110 |
+
"acc": 0.45878136200716846,
|
111 |
+
"acc_norm": 0.46714456391875747
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"name": "turkish_plu_next_event_prediction",
|
115 |
+
"task": "multiple_choice",
|
116 |
+
"acc": 0.45648854961832064,
|
117 |
+
"acc_norm": 0.5190839694656488
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"name": "turkish_plu_step_inference",
|
121 |
+
"task": "multiple_choice",
|
122 |
+
"acc": 0.35784313725490197,
|
123 |
+
"acc_norm": 0.5
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"name": "turkish_plu_step_ordering",
|
127 |
+
"task": "multiple_choice",
|
128 |
+
"acc": 0.6248775710088149,
|
129 |
+
"acc_norm": 0.6248775710088149
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"name": "wiki_lingua_tr",
|
133 |
+
"task": "summarization",
|
134 |
+
"rouge1": 0.14941800836498376,
|
135 |
+
"rouge2": 0.04469826846423095,
|
136 |
+
"rougeL": 0.11118162846926655
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"name": "wmt-tr-en-prompt",
|
140 |
+
"task": "machine_translation",
|
141 |
+
"wer": 2.833755212322392,
|
142 |
+
"bleu": 0.030496946295093332
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"name": "xcopa_tr",
|
146 |
+
"task": "multiple_choice",
|
147 |
+
"acc": 0.642,
|
148 |
+
"acc_norm": 0.642
|
149 |
+
},
|
150 |
+
{
|
151 |
+
"name": "xlsum_tr",
|
152 |
+
"task": "summarization",
|
153 |
+
"rouge1": 0.2462743722502333,
|
154 |
+
"rouge2": 0.09312295140534987,
|
155 |
+
"rougeL": 0.1685445897911506
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"name": "tr-wikihow-summ",
|
159 |
+
"task": "summarization",
|
160 |
+
"rouge1": null,
|
161 |
+
"rouge2": null,
|
162 |
+
"rougeL": null
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"name": "xquad_tr",
|
166 |
+
"task": "extractive_question_answering",
|
167 |
+
"exact_match": 0.008403361344537815,
|
168 |
+
"f1": 0.027799180278171867
|
169 |
+
}
|
170 |
+
]
|
171 |
+
}
|
results/zero-shot/llama-3-8b-instruct.json
ADDED
@@ -0,0 +1,160 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"trust_remote_code": "True",
|
4 |
+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
|
5 |
+
"api": "hf",
|
6 |
+
"architecture": "LlamaForCausalLM",
|
7 |
+
"max_length": 8192,
|
8 |
+
"type": "instruction-tuned",
|
9 |
+
"dtype": "bfloat16",
|
10 |
+
"num_parameters": "8b"
|
11 |
+
},
|
12 |
+
"results": [
|
13 |
+
{
|
14 |
+
"name": "belebele_tr",
|
15 |
+
"task": "multiple_choice",
|
16 |
+
"acc": 0.6633333333333333,
|
17 |
+
"acc_norm": 0.6633333333333333
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"name": "exams_tr",
|
21 |
+
"task": "multiple_choice",
|
22 |
+
"acc": 0.2697201017811705,
|
23 |
+
"acc_norm": 0.3104325699745547
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"name": "check_worthiness",
|
27 |
+
"task": "multiple_choice",
|
28 |
+
"acc": 0.4218464351005484,
|
29 |
+
"acc_norm": 0.5644424131627057
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"name": "ironytr",
|
33 |
+
"task": "text_classification",
|
34 |
+
"acc": 0.545,
|
35 |
+
"acc_norm": 0.6466666666666666
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"name": "mkqa_tr",
|
39 |
+
"task": "extractive_question_answering",
|
40 |
+
"exact_match": 0.0424681858538029,
|
41 |
+
"f1": 0.11050423163975964
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"name": "mnli_tr",
|
45 |
+
"task": "natural_language_inference",
|
46 |
+
"acc": 0.3201,
|
47 |
+
"acc_norm": 0.3653
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"name": "news_cat",
|
51 |
+
"task": "text_classification",
|
52 |
+
"acc": 0.628,
|
53 |
+
"acc_norm": 0.588
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"name": "offenseval_tr",
|
57 |
+
"task": "text_classification",
|
58 |
+
"acc": 0.3081065759637188,
|
59 |
+
"acc_norm": 0.7304421768707483
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"name": "relevance_judgment",
|
63 |
+
"task": "multiple_choice",
|
64 |
+
"acc": 0.603290676416819,
|
65 |
+
"acc_norm": 0.5790676416819013
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"name": "snli_tr",
|
69 |
+
"task": "natural_language_inference",
|
70 |
+
"acc": 0.3283,
|
71 |
+
"acc_norm": 0.353
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"name": "sts_tr",
|
75 |
+
"task": "text_classification",
|
76 |
+
"acc": 0.14213197969543148,
|
77 |
+
"acc_norm": 0.21537345902828137
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"name": "tquad",
|
81 |
+
"task": "extractive_question_answering",
|
82 |
+
"exact_match": 0.1289237668161435,
|
83 |
+
"f1": 0.4134057883004977
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"name": "turkish_plu_goal_inference",
|
87 |
+
"task": "multiple_choice",
|
88 |
+
"acc": 0.38829151732377537,
|
89 |
+
"acc_norm": 0.43130227001194743
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"name": "turkish_plu_next_event_prediction",
|
93 |
+
"task": "multiple_choice",
|
94 |
+
"acc": 0.4549618320610687,
|
95 |
+
"acc_norm": 0.517557251908397
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"name": "turkish_plu_step_inference",
|
99 |
+
"task": "multiple_choice",
|
100 |
+
"acc": 0.3137254901960784,
|
101 |
+
"acc_norm": 0.44281045751633985
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"name": "turkish_plu_step_ordering",
|
105 |
+
"task": "multiple_choice",
|
106 |
+
"acc": 0.6160626836434868,
|
107 |
+
"acc_norm": 0.6160626836434868
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"name": "xcopa_tr",
|
111 |
+
"task": "multiple_choice",
|
112 |
+
"acc": 0.586,
|
113 |
+
"acc_norm": 0.586
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"name": "xnli_tr",
|
117 |
+
"task": "natural_language_inference",
|
118 |
+
"acc": 0.4389558232931727,
|
119 |
+
"acc_norm": 0.4389558232931727
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"name": "xquad_tr",
|
123 |
+
"task": "extractive_question_answering",
|
124 |
+
"exact_match": 0.09747899159663864,
|
125 |
+
"f1": 0.24450355256139333
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"name": "gecturk_generation",
|
129 |
+
"task": "grammatical_error_correction",
|
130 |
+
"exact_match": 0.005007463045885695
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"name": "mlsum_tr",
|
134 |
+
"task": "summarization",
|
135 |
+
"rouge1": 0.40612528796779146,
|
136 |
+
"rouge2": 0.25769550481564407,
|
137 |
+
"rougeL": 0.3281187592669974
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"name": "wiki_lingua_tr",
|
141 |
+
"task": "summarization",
|
142 |
+
"rouge1": 0.23621778991663983,
|
143 |
+
"rouge2": 0.08052321922363763,
|
144 |
+
"rougeL": 0.1710165526266978
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"name": "wmt-tr-en-prompt",
|
148 |
+
"task": "machine_translation",
|
149 |
+
"wer": 0.823814082821166,
|
150 |
+
"bleu": 0.13572050882587958
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"name": "xlsum_tr",
|
154 |
+
"task": "summarization",
|
155 |
+
"rouge1": 0.29619456321037296,
|
156 |
+
"rouge2": 0.13520487191226377,
|
157 |
+
"rougeL": 0.220446635816053
|
158 |
+
}
|
159 |
+
]
|
160 |
+
}
|
results/zero-shot/llama-3-8b.json
ADDED
@@ -0,0 +1,159 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"model": "meta-llama/Meta-Llama-3-8B",
|
4 |
+
"api": "hf",
|
5 |
+
"architecture": "LlamaForCausalLM",
|
6 |
+
"max_length": 8192,
|
7 |
+
"type": "pretrained",
|
8 |
+
"dtype": "bfloat16",
|
9 |
+
"num_parameters": "8b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "belebele_tr",
|
14 |
+
"task": "multiple_choice",
|
15 |
+
"acc": 0.5144,
|
16 |
+
"acc_norm": 0.5144
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "exams_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.3028,
|
22 |
+
"acc_norm": 0.3537
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "check_worthiness",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.37614259597806216,
|
28 |
+
"acc_norm": 0.38391224862888484
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "ironytr",
|
32 |
+
"task": "text_classification",
|
33 |
+
"acc": 0.515,
|
34 |
+
"acc_norm": 0.525
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"name": "mkqa_tr",
|
38 |
+
"task": "extractive_question_answering",
|
39 |
+
"exact_match": 0.13465522343888725,
|
40 |
+
"f1": 0.19144550324599957
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"name": "mnli_tr",
|
44 |
+
"task": "natural_language_inference",
|
45 |
+
"acc": 0.3206,
|
46 |
+
"acc_norm": 0.3329
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"name": "news_cat",
|
50 |
+
"task": "text_classification",
|
51 |
+
"acc": 0.724,
|
52 |
+
"acc_norm": 0.656
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "offenseval_tr",
|
56 |
+
"task": "text_classification",
|
57 |
+
"acc": 0.2193877551020408,
|
58 |
+
"acc_norm": 0.48214285714285715
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "relevance_judgment",
|
62 |
+
"task": "multiple_choice",
|
63 |
+
"acc": 0.42550274223034734,
|
64 |
+
"acc_norm": 0.5173674588665448
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "snli_tr",
|
68 |
+
"task": "natural_language_inference",
|
69 |
+
"acc": 0.325,
|
70 |
+
"acc_norm": 0.3766
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "sts_tr",
|
74 |
+
"task": "text_classification",
|
75 |
+
"acc": 0.16388687454677303,
|
76 |
+
"acc_norm": 0.19216823785351705
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "tquad",
|
80 |
+
"task": "extractive_question_answering",
|
81 |
+
"exact_match": 0.28475336322869954,
|
82 |
+
"f1": 0.5013148868557868
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "turkish_plu_goal_inference",
|
86 |
+
"task": "multiple_choice",
|
87 |
+
"acc": 0.38948626045400236,
|
88 |
+
"acc_norm": 0.4169653524492234
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "turkish_plu_next_event_prediction",
|
92 |
+
"task": "multiple_choice",
|
93 |
+
"acc": 0.4488549618320611,
|
94 |
+
"acc_norm": 0.5328244274809161
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "turkish_plu_step_inference",
|
98 |
+
"task": "multiple_choice",
|
99 |
+
"acc": 0.32189542483660133,
|
100 |
+
"acc_norm": 0.47058823529411764
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "turkish_plu_step_ordering",
|
104 |
+
"task": "multiple_choice",
|
105 |
+
"acc": 0.6278158667972575,
|
106 |
+
"acc_norm": 0.6278158667972575
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "xcopa_tr",
|
110 |
+
"task": "multiple_choice",
|
111 |
+
"acc": 0.618,
|
112 |
+
"acc_norm": 0.618
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "xnli_tr",
|
116 |
+
"task": "natural_language_inference",
|
117 |
+
"acc": 0.4839357429718876,
|
118 |
+
"acc_norm": 0.4839357429718876
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "xquad_tr",
|
122 |
+
"task": "extractive_question_answering",
|
123 |
+
"exact_match": 0.20840336134453782,
|
124 |
+
"f1": 0.33796418555415153
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"name": "gecturk_generation",
|
128 |
+
"task": "grammatical_error_correction",
|
129 |
+
"exact_match": 0.006692666955558766
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"name": "mlsum_tr",
|
133 |
+
"task": "summarization",
|
134 |
+
"rouge1": 0.38446881575055203,
|
135 |
+
"rouge2": 0.2503978598237102,
|
136 |
+
"rougeL": 0.319713589198042
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"name": "wiki_lingua_tr",
|
140 |
+
"task": "summarization",
|
141 |
+
"rouge1": 0.2069234464456151,
|
142 |
+
"rouge2": 0.06576422586110373,
|
143 |
+
"rougeL": 0.1516869929958613
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"name": "wmt-tr-en-prompt",
|
147 |
+
"task": "machine_translation",
|
148 |
+
"wer": 0.9262281724087097,
|
149 |
+
"bleu": 0.113320746345327
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"name": "xlsum_tr",
|
153 |
+
"task": "summarization",
|
154 |
+
"rouge1": 0.2615001361521869,
|
155 |
+
"rouge2": 0.11093149007661907,
|
156 |
+
"rougeL": 0.20321693263972507
|
157 |
+
}
|
158 |
+
]
|
159 |
+
}
|
results/zero-shot/llama-3.1-8b-instruct.json
ADDED
@@ -0,0 +1,159 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
4 |
+
"api": "hf",
|
5 |
+
"dtype": "bfloat16",
|
6 |
+
"max_length": 131072,
|
7 |
+
"architecture": "LlamaForCausalLM",
|
8 |
+
"type": "instruction-tuned",
|
9 |
+
"num_parameters": "8b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "belebele_tr",
|
14 |
+
"task": "multiple_choice",
|
15 |
+
"acc": 0.7077777777777777,
|
16 |
+
"acc_norm": 0.7077777777777777
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "exams_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.3231552162849873,
|
22 |
+
"acc_norm": 0.35877862595419846
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "check_worthiness",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.37614259597806216,
|
28 |
+
"acc_norm": 0.37614259597806216
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "ironytr",
|
32 |
+
"task": "text_classification",
|
33 |
+
"acc": 0.5133333333333333,
|
34 |
+
"acc_norm": 0.5666666666666667
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"name": "mkqa_tr",
|
38 |
+
"task": "extractive_question_answering",
|
39 |
+
"exact_match": 0.09115122817401598,
|
40 |
+
"f1": 0.15627870028803578
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"name": "mnli_tr",
|
44 |
+
"task": "natural_language_inference",
|
45 |
+
"acc": 0.3209,
|
46 |
+
"acc_norm": 0.3596
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"name": "news_cat",
|
50 |
+
"task": "text_classification",
|
51 |
+
"acc": 0.66,
|
52 |
+
"acc_norm": 0.604
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "offenseval_tr",
|
56 |
+
"task": "text_classification",
|
57 |
+
"acc": 0.23582766439909297,
|
58 |
+
"acc_norm": 0.3687641723356009
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "relevance_judgment",
|
62 |
+
"task": "multiple_choice",
|
63 |
+
"acc": 0.4648080438756856,
|
64 |
+
"acc_norm": 0.5648994515539305
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "snli_tr",
|
68 |
+
"task": "natural_language_inference",
|
69 |
+
"acc": 0.3028,
|
70 |
+
"acc_norm": 0.3528
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "sts_tr",
|
74 |
+
"task": "text_classification",
|
75 |
+
"acc": 0.19579405366207397,
|
76 |
+
"acc_norm": 0.1551849166062364
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "tquad",
|
80 |
+
"task": "extractive_question_answering",
|
81 |
+
"exact_match": 0.23318385650224216,
|
82 |
+
"f1": 0.5062272078338648
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "turkish_plu_goal_inference",
|
86 |
+
"task": "multiple_choice",
|
87 |
+
"acc": 0.40860215053763443,
|
88 |
+
"acc_norm": 0.45997610513739545
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "turkish_plu_next_event_prediction",
|
92 |
+
"task": "multiple_choice",
|
93 |
+
"acc": 0.4442748091603053,
|
94 |
+
"acc_norm": 0.5419847328244275
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "turkish_plu_step_inference",
|
98 |
+
"task": "multiple_choice",
|
99 |
+
"acc": 0.33169934640522875,
|
100 |
+
"acc_norm": 0.4624183006535948
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "turkish_plu_step_ordering",
|
104 |
+
"task": "multiple_choice",
|
105 |
+
"acc": 0.633692458374143,
|
106 |
+
"acc_norm": 0.633692458374143
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "xcopa_tr",
|
110 |
+
"task": "multiple_choice",
|
111 |
+
"acc": 0.608,
|
112 |
+
"acc_norm": 0.608
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "xnli_tr",
|
116 |
+
"task": "natural_language_inference",
|
117 |
+
"acc": 0.4807228915662651,
|
118 |
+
"acc_norm": 0.4807228915662651
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "xquad_tr",
|
122 |
+
"task": "extractive_question_answering",
|
123 |
+
"exact_match": 0.21428571428571427,
|
124 |
+
"f1": 0.4170277103753468
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"name": "gecturk_generation",
|
128 |
+
"task": "grammatical_error_correction",
|
129 |
+
"exact_match": 0.005007463045885695
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"name": "mlsum_tr",
|
133 |
+
"task": "summarization",
|
134 |
+
"rouge1": 0.40612528796779146,
|
135 |
+
"rouge2": 0.25769550481564407,
|
136 |
+
"rougeL": 0.3281187592669974
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"name": "wiki_lingua_tr",
|
140 |
+
"task": "summarization",
|
141 |
+
"rouge1": 0.23621778991663983,
|
142 |
+
"rouge2": 0.08052321922363763,
|
143 |
+
"rougeL": 0.1710165526266978
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"name": "wmt-tr-en-prompt",
|
147 |
+
"task": "machine_translation",
|
148 |
+
"wer": 0.823814082821166,
|
149 |
+
"bleu": 0.13572050882587958
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"name": "xlsum_tr",
|
153 |
+
"task": "summarization",
|
154 |
+
"rouge1": 0.29619456321037296,
|
155 |
+
"rouge2": 0.13520487191226377,
|
156 |
+
"rougeL": 0.220446635816053
|
157 |
+
}
|
158 |
+
]
|
159 |
+
}
|
results/zero-shot/llama-3.1-8b.json
ADDED
@@ -0,0 +1,127 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"model": "meta-llama/Meta-Llama-3.1-8B",
|
4 |
+
"api": "hf",
|
5 |
+
"dtype": "bfloat16",
|
6 |
+
"max_length": 131072,
|
7 |
+
"architecture": "LlamaForCausalLM",
|
8 |
+
"type": "pretrained",
|
9 |
+
"num_parameters": "8b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "belebele_tr",
|
14 |
+
"task": "multiple_choice",
|
15 |
+
"acc": 0.6144,
|
16 |
+
"acc_norm": 0.6144
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "exams_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.3130,
|
22 |
+
"acc_norm": 0.3537
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "check_worthiness",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.37614259597806216,
|
28 |
+
"acc_norm": 0.37751371115173676
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "ironytr",
|
32 |
+
"task": "text_classification",
|
33 |
+
"acc": 0.585,
|
34 |
+
"acc_norm": 0.5183333333333333
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"name": "mkqa_tr",
|
38 |
+
"task": "extractive_question_answering",
|
39 |
+
"exact_match": 0.09248298313110388,
|
40 |
+
"f1": 0.15127108197296948
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"name": "mnli_tr",
|
44 |
+
"task": "natural_language_inference",
|
45 |
+
"acc": 0.3495,
|
46 |
+
"acc_norm": 0.3481
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"name": "news_cat",
|
50 |
+
"task": "text_classification",
|
51 |
+
"acc": 0.692,
|
52 |
+
"acc_norm": 0.588
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "offenseval_tr",
|
56 |
+
"task": "text_classification",
|
57 |
+
"acc": 0.3463718820861678,
|
58 |
+
"acc_norm": 0.7636054421768708
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "relevance_judgment",
|
62 |
+
"task": "multiple_choice",
|
63 |
+
"acc": 0.4227605118829982,
|
64 |
+
"acc_norm": 0.506398537477148
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "snli_tr",
|
68 |
+
"task": "natural_language_inference",
|
69 |
+
"acc": 0.3169,
|
70 |
+
"acc_norm": 0.3379
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "sts_tr",
|
74 |
+
"task": "text_classification",
|
75 |
+
"acc": 0.17041334300217548,
|
76 |
+
"acc_norm": 0.2001450326323423
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "tquad",
|
80 |
+
"task": "extractive_question_answering",
|
81 |
+
"exact_match": 0.2757847533632287,
|
82 |
+
"f1": 0.5178366277473359
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "turkish_plu_goal_inference",
|
86 |
+
"task": "multiple_choice",
|
87 |
+
"acc": 0.4145758661887694,
|
88 |
+
"acc_norm": 0.4324970131421744
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "turkish_plu_next_event_prediction",
|
92 |
+
"task": "multiple_choice",
|
93 |
+
"acc": 0.4488549618320611,
|
94 |
+
"acc_norm": 0.5358778625954198
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "turkish_plu_step_inference",
|
98 |
+
"task": "multiple_choice",
|
99 |
+
"acc": 0.3382352941176471,
|
100 |
+
"acc_norm": 0.4738562091503268
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "turkish_plu_step_ordering",
|
104 |
+
"task": "multiple_choice",
|
105 |
+
"acc": 0.6425073457394711,
|
106 |
+
"acc_norm": 0.6425073457394711
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "xcopa_tr",
|
110 |
+
"task": "multiple_choice",
|
111 |
+
"acc": 0.626,
|
112 |
+
"acc_norm": 0.626
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "xnli_tr",
|
116 |
+
"task": "natural_language_inference",
|
117 |
+
"acc": 0.4947791164658635,
|
118 |
+
"acc_norm": 0.4947791164658635
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "xquad_tr",
|
122 |
+
"task": "extractive_question_answering",
|
123 |
+
"exact_match": 0.2092436974789916,
|
124 |
+
"f1": 0.35674599908781446
|
125 |
+
}
|
126 |
+
]
|
127 |
+
}
|
results/zero-shot/llama-3.2-1b.json
ADDED
@@ -0,0 +1,191 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"model": "meta-llama/Llama-3.2-1B",
|
4 |
+
"api": "hf",
|
5 |
+
"dtype": "bfloat16",
|
6 |
+
"max_length": 131072,
|
7 |
+
"architecture": "LlamaForCausalLM",
|
8 |
+
"type": "pretrained",
|
9 |
+
"num_parameters": "1b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "belebele_tr",
|
14 |
+
"task": "multiple_choice",
|
15 |
+
"acc": 0.29555555555555557,
|
16 |
+
"acc_norm": 0.29555555555555557
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "exams_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.28498727735368956,
|
22 |
+
"acc_norm": 0.3053435114503817
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "check_worthiness",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.3880255941499086,
|
28 |
+
"acc_norm": 0.623400365630713
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "gecturk_generation",
|
32 |
+
"task": "grammatical_error_correction",
|
33 |
+
"exact_match": 0.00741489720256151
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"name": "ironytr",
|
37 |
+
"task": "text_classification",
|
38 |
+
"acc": 0.5283333333333333,
|
39 |
+
"acc_norm": 0.5033333333333333
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"name": "mkqa_tr",
|
43 |
+
"task": "extractive_question_answering",
|
44 |
+
"exact_match": 0.007694584196507843,
|
45 |
+
"f1": 0.03304091036050505
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"name": "mlsum_tr",
|
49 |
+
"task": "summarization",
|
50 |
+
"rouge1": 0.23283491254211872,
|
51 |
+
"rouge2": 0.13426790568610214,
|
52 |
+
"rougeL": 0.18915548037371513
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "mnli_tr",
|
56 |
+
"task": "natural_language_inference",
|
57 |
+
"acc": 0.3232,
|
58 |
+
"acc_norm": 0.334
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "news_cat",
|
62 |
+
"task": "text_classification",
|
63 |
+
"acc": 0.58,
|
64 |
+
"acc_norm": 0.532
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "offenseval_tr",
|
68 |
+
"task": "text_classification",
|
69 |
+
"acc": 0.4671201814058957,
|
70 |
+
"acc_norm": 0.7820294784580499
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "relevance_judgment",
|
74 |
+
"task": "multiple_choice",
|
75 |
+
"acc": 0.56672760511883,
|
76 |
+
"acc_norm": 0.5781535648994516
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "snli_tr",
|
80 |
+
"task": "natural_language_inference",
|
81 |
+
"acc": 0.3239,
|
82 |
+
"acc_norm": 0.3105
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "sts_tr",
|
86 |
+
"task": "text_classification",
|
87 |
+
"acc": 0.17113850616388687,
|
88 |
+
"acc_norm": 0.22552574329224076
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "tquad",
|
92 |
+
"task": "extractive_question_answering",
|
93 |
+
"exact_match": 0.06278026905829596,
|
94 |
+
"f1": 0.21486130318406463
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "turkish_plu_goal_inference",
|
98 |
+
"task": "multiple_choice",
|
99 |
+
"acc": 0.35842293906810035,
|
100 |
+
"acc_norm": 0.4026284348864994
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "turkish_plu_next_event_prediction",
|
104 |
+
"task": "multiple_choice",
|
105 |
+
"acc": 0.3709923664122137,
|
106 |
+
"acc_norm": 0.467175572519084
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "turkish_plu_step_inference",
|
110 |
+
"task": "multiple_choice",
|
111 |
+
"acc": 0.27941176470588236,
|
112 |
+
"acc_norm": 0.41830065359477125
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "turkish_plu_step_ordering",
|
116 |
+
"task": "multiple_choice",
|
117 |
+
"acc": 0.5759059745347699,
|
118 |
+
"acc_norm": 0.5759059745347699
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "wiki_lingua_tr",
|
122 |
+
"task": "summarization",
|
123 |
+
"rouge1": 0.10861529436199803,
|
124 |
+
"rouge2": 0.034862923521078545,
|
125 |
+
"rougeL": 0.08692160533533941
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"name": "wmt-tr-en-prompt",
|
129 |
+
"task": "machine_translation",
|
130 |
+
"wer": 3.910683208136067,
|
131 |
+
"bleu": 0.012043288243775466
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"name": "xcopa_tr",
|
135 |
+
"task": "multiple_choice",
|
136 |
+
"acc": 0.556,
|
137 |
+
"acc_norm": 0.556
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"name": "xlsum_tr",
|
141 |
+
"task": "summarization",
|
142 |
+
"rouge1": 0.16924699150407269,
|
143 |
+
"rouge2": 0.07190935921365724,
|
144 |
+
"rougeL": 0.13255123335488528
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"name": "xnli_tr",
|
148 |
+
"task": "natural_language_inference",
|
149 |
+
"acc": 0.4389558232931727,
|
150 |
+
"acc_norm": 0.4389558232931727
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"name": "xquad_tr",
|
154 |
+
"task": "extractive_question_answering",
|
155 |
+
"exact_match": 0.04873949579831932,
|
156 |
+
"f1": 0.11156636293859905
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"name": "gecturk_generation",
|
160 |
+
"task": "grammatical_error_correction",
|
161 |
+
"exact_match": 0.0073185998362944775
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"name": "mlsum_tr",
|
165 |
+
"task": "summarization",
|
166 |
+
"rouge1": 0.35440052022111407,
|
167 |
+
"rouge2": 0.2215476501673455,
|
168 |
+
"rougeL": 0.2911311598176804
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"name": "wiki_lingua_tr",
|
172 |
+
"task": "summarization",
|
173 |
+
"rouge1": 0.18510384577665046,
|
174 |
+
"rouge2": 0.056181066004903614,
|
175 |
+
"rougeL": 0.1392211003290612
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "wmt-tr-en-prompt",
|
179 |
+
"task": "machine_translation",
|
180 |
+
"wer": 1.311990023748812,
|
181 |
+
"bleu": 0.02624044942774961
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "xlsum_tr",
|
185 |
+
"task": "summarization",
|
186 |
+
"rouge1": 0.2429304790539497,
|
187 |
+
"rouge2": 0.09668008744707143,
|
188 |
+
"rougeL": 0.18327092913535944
|
189 |
+
}
|
190 |
+
]
|
191 |
+
}
|
results/zero-shot/llama-3.2-3b-instruct.json
ADDED
@@ -0,0 +1,191 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"model": "meta-llama/Llama-3.2-3B-Instruct",
|
4 |
+
"api": "hf",
|
5 |
+
"dtype": "bfloat16",
|
6 |
+
"max_length": 131072,
|
7 |
+
"architecture": "LlamaForCausalLM",
|
8 |
+
"type": "instruction-tuned",
|
9 |
+
"num_parameters": "3b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "belebele_tr",
|
14 |
+
"task": "multiple_choice",
|
15 |
+
"acc": 0.5577777777777778,
|
16 |
+
"acc_norm": 0.5577777777777778
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "exams_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.26208651399491095,
|
22 |
+
"acc_norm": 0.3053435114503817
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "check_worthiness",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.37614259597806216,
|
28 |
+
"acc_norm": 0.3807129798903108
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "gecturk_generation",
|
32 |
+
"task": "grammatical_error_correction",
|
33 |
+
"exact_match": 0.007222302470027445
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"name": "ironytr",
|
37 |
+
"task": "text_classification",
|
38 |
+
"acc": 0.5016666666666667,
|
39 |
+
"acc_norm": 0.5083333333333333
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"name": "mkqa_tr",
|
43 |
+
"task": "extractive_question_answering",
|
44 |
+
"exact_match": 0.04675939627108612,
|
45 |
+
"f1": 0.08114473798410345
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"name": "mlsum_tr",
|
49 |
+
"task": "summarization",
|
50 |
+
"rouge1": 0.2669056212126977,
|
51 |
+
"rouge2": 0.1480446780314802,
|
52 |
+
"rougeL": 0.2106440565987865
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "mnli_tr",
|
56 |
+
"task": "natural_language_inference",
|
57 |
+
"acc": 0.32,
|
58 |
+
"acc_norm": 0.3141
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "news_cat",
|
62 |
+
"task": "text_classification",
|
63 |
+
"acc": 0.64,
|
64 |
+
"acc_norm": 0.552
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "offenseval_tr",
|
68 |
+
"task": "text_classification",
|
69 |
+
"acc": 0.20634920634920634,
|
70 |
+
"acc_norm": 0.35600907029478457
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "relevance_judgment",
|
74 |
+
"task": "multiple_choice",
|
75 |
+
"acc": 0.4227605118829982,
|
76 |
+
"acc_norm": 0.42413162705667273
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "snli_tr",
|
80 |
+
"task": "natural_language_inference",
|
81 |
+
"acc": 0.319,
|
82 |
+
"acc_norm": 0.2923
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "sts_tr",
|
86 |
+
"task": "text_classification",
|
87 |
+
"acc": 0.12907904278462654,
|
88 |
+
"acc_norm": 0.16896301667875271
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "tquad",
|
92 |
+
"task": "extractive_question_answering",
|
93 |
+
"exact_match": 0.18721973094170405,
|
94 |
+
"f1": 0.5109898180473623
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "turkish_plu_goal_inference",
|
98 |
+
"task": "multiple_choice",
|
99 |
+
"acc": 0.3321385902031063,
|
100 |
+
"acc_norm": 0.3548387096774194
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "turkish_plu_next_event_prediction",
|
104 |
+
"task": "multiple_choice",
|
105 |
+
"acc": 0.3648854961832061,
|
106 |
+
"acc_norm": 0.4488549618320611
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "turkish_plu_step_inference",
|
110 |
+
"task": "multiple_choice",
|
111 |
+
"acc": 0.24183006535947713,
|
112 |
+
"acc_norm": 0.3758169934640523
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "turkish_plu_step_ordering",
|
116 |
+
"task": "multiple_choice",
|
117 |
+
"acc": 0.5710088148873653,
|
118 |
+
"acc_norm": 0.5710088148873653
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "wiki_lingua_tr",
|
122 |
+
"task": "summarization",
|
123 |
+
"rouge1": 0.1342879173103036,
|
124 |
+
"rouge2": 0.041489300068460175,
|
125 |
+
"rougeL": 0.10482785510181569
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"name": "wmt-tr-en-prompt",
|
129 |
+
"task": "machine_translation",
|
130 |
+
"wer": 1.7706536060519733,
|
131 |
+
"bleu": 0.048843165627950165
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"name": "xcopa_tr",
|
135 |
+
"task": "multiple_choice",
|
136 |
+
"acc": 0.546,
|
137 |
+
"acc_norm": 0.546
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"name": "xlsum_tr",
|
141 |
+
"task": "summarization",
|
142 |
+
"rouge1": 0.17224405229987672,
|
143 |
+
"rouge2": 0.06736413357191079,
|
144 |
+
"rougeL": 0.12750762702828333
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"name": "xnli_tr",
|
148 |
+
"task": "natural_language_inference",
|
149 |
+
"acc": 0.42811244979919677,
|
150 |
+
"acc_norm": 0.42811244979919677
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"name": "xquad_tr",
|
154 |
+
"task": "extractive_question_answering",
|
155 |
+
"exact_match": 0.23025210084033615,
|
156 |
+
"f1": 0.4335914561273987
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"name": "gecturk_generation",
|
160 |
+
"task": "grammatical_error_correction",
|
161 |
+
"exact_match": 0.009726033992970293
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"name": "mlsum_tr",
|
165 |
+
"task": "summarization",
|
166 |
+
"rouge1": 0.36482642805140486,
|
167 |
+
"rouge2": 0.2215366481025873,
|
168 |
+
"rougeL": 0.2964001074060548
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"name": "wiki_lingua_tr",
|
172 |
+
"task": "summarization",
|
173 |
+
"rouge1": 0.21420020104688736,
|
174 |
+
"rouge2": 0.06939715371402275,
|
175 |
+
"rougeL": 0.1623531918550368
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"name": "wmt-tr-en-prompt",
|
179 |
+
"task": "machine_translation",
|
180 |
+
"wer": 0.9910280580654681,
|
181 |
+
"bleu": 0.08179536823012563
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "xlsum_tr",
|
185 |
+
"task": "summarization",
|
186 |
+
"rouge1": 0.2616423061938248,
|
187 |
+
"rouge2": 0.11064039063859936,
|
188 |
+
"rougeL": 0.19686955120787036
|
189 |
+
}
|
190 |
+
]
|
191 |
+
}
|
results/zero-shot/mistral-7b.json
ADDED
@@ -0,0 +1,165 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"dtype": "bfloat16",
|
4 |
+
"max_length": "4096",
|
5 |
+
"model": "mistralai/Mistral-7B-v0.1",
|
6 |
+
"api": "hf",
|
7 |
+
"architecture": "MixtralForCausalLM",
|
8 |
+
"type": "pretrained",
|
9 |
+
"num_parameters": "7b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "xquad_tr",
|
14 |
+
"task": "extractive_question_answering",
|
15 |
+
"exact_match": 0.16722689075630254,
|
16 |
+
"f1": 0.32150094374615246
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "xcopa_tr",
|
20 |
+
"task": "multiple_choice",
|
21 |
+
"acc": 0.566,
|
22 |
+
"acc_norm": 0.566
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"name": "turkish_plu",
|
26 |
+
"task": "multiple_choice",
|
27 |
+
"acc": 0.45152,
|
28 |
+
"acc_norm": 0.5136
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"name": "turkish_plu_goal_inference",
|
32 |
+
"task": "multiple_choice",
|
33 |
+
"acc": 0.42771804062126645,
|
34 |
+
"acc_norm": 0.46714456391875747
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"name": "turkish_plu_next_event_prediction",
|
38 |
+
"task": "multiple_choice",
|
39 |
+
"acc": 0.39541984732824426,
|
40 |
+
"acc_norm": 0.5022900763358779
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"name": "turkish_plu_step_inference",
|
44 |
+
"task": "multiple_choice",
|
45 |
+
"acc": 0.29248366013071897,
|
46 |
+
"acc_norm": 0.4411764705882353
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"name": "turkish_plu_step_ordering",
|
50 |
+
"task": "multiple_choice",
|
51 |
+
"acc": 0.6023506366307542,
|
52 |
+
"acc_norm": 0.6023506366307542
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"name": "check_worthiness",
|
56 |
+
"task": "multiple_choice",
|
57 |
+
"acc": 0.37614259597806216,
|
58 |
+
"acc_norm": 0.42458866544789764
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"name": "relevance_judgment",
|
62 |
+
"task": "multiple_choice",
|
63 |
+
"acc": 0.4218464351005484,
|
64 |
+
"acc_norm": 0.49588665447897623
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"name": "tquad",
|
68 |
+
"task": "extractive_question_answering",
|
69 |
+
"exact_match": 0.2096412556053812,
|
70 |
+
"f1": 0.4767364701184728
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"name": "sts_tr",
|
74 |
+
"task": "text_classification",
|
75 |
+
"acc": 0.135605511240029,
|
76 |
+
"acc_norm": 0.20522117476432197
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"name": "offenseval_tr",
|
80 |
+
"task": "text_classification",
|
81 |
+
"acc": 0.2046485260770975,
|
82 |
+
"acc_norm": 0.3735827664399093
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"name": "mnli_tr",
|
86 |
+
"task": "natural_language_inference",
|
87 |
+
"acc": 0.3194,
|
88 |
+
"acc_norm": 0.3267
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"name": "snli_tr",
|
92 |
+
"task": "natural_language_inference",
|
93 |
+
"acc": 0.3196,
|
94 |
+
"acc_norm": 0.3201
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"name": "xnli_tr",
|
98 |
+
"task": "natural_language_inference",
|
99 |
+
"acc": 0.331936127744511,
|
100 |
+
"acc_norm": 0.34910179640718564
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"name": "news_cat",
|
104 |
+
"task": "text_classification",
|
105 |
+
"acc": 0.652,
|
106 |
+
"acc_norm": 0.44
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"name": "mkqa_tr",
|
110 |
+
"task": "extractive_question_answering",
|
111 |
+
"exact_match": 0.12030186445693992,
|
112 |
+
"f1": 0.16163416207615164
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"name": "ironytr",
|
116 |
+
"task": "text_classification",
|
117 |
+
"acc": 0.5016666666666667,
|
118 |
+
"acc_norm": 0.52
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "exams_tr",
|
122 |
+
"task": "multiple_choice",
|
123 |
+
"acc": 0.24173027989821882,
|
124 |
+
"acc_norm": 0.30279898218829515
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"name": "belebele_tr",
|
128 |
+
"task": "multiple_choice",
|
129 |
+
"acc": 0.37444444444444447,
|
130 |
+
"acc_norm": 0.37444444444444447
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"name": "gecturk_generation",
|
134 |
+
"task": "grammatical_error_correction",
|
135 |
+
"exact_match": 0.20660599932591844
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"name": "mlsum_tr",
|
139 |
+
"task": "summarization",
|
140 |
+
"rouge1": 0.09403885616158554,
|
141 |
+
"rouge2": 0.06300721907752257,
|
142 |
+
"rougeL": 0.08169726458665999
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"name": "wiki_lingua_tr",
|
146 |
+
"task": "summarization",
|
147 |
+
"rouge1": 0.1905392717787084,
|
148 |
+
"rouge2": 0.05957088325130176,
|
149 |
+
"rougeL": 0.1472985242082243
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"name": "wmt-tr-en-prompt",
|
153 |
+
"task": "machine_translation",
|
154 |
+
"wer": 1.0876062644712858,
|
155 |
+
"bleu": 0.04973628734419603
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"name": "xlsum_tr",
|
159 |
+
"task": "summarization",
|
160 |
+
"rouge1": 0.02720399421152351,
|
161 |
+
"rouge2": 0.012032606076011431,
|
162 |
+
"rougeL": 0.02311080687545987
|
163 |
+
}
|
164 |
+
]
|
165 |
+
}
|
results/zero-shot/trendyol-7b.json
ADDED
@@ -0,0 +1,172 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"dtype": "bfloat16",
|
4 |
+
"max_length": "4096",
|
5 |
+
"model": "Trendyol/Trendyol-LLM-7b-base-v1.0",
|
6 |
+
"api": "hf",
|
7 |
+
"architecture": "MixtralForCausalLM",
|
8 |
+
"type": "instruction-tuned",
|
9 |
+
"num_parameters": "7b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "xquad_tr",
|
14 |
+
"task": "extractive_question_answering",
|
15 |
+
"exact_match": 0.0,
|
16 |
+
"f1": 0.15289561928390746
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "xlsum_tr",
|
20 |
+
"task": "summarization",
|
21 |
+
"rouge1": 0.12128827095936726,
|
22 |
+
"rouge2": 0.05041801264157676,
|
23 |
+
"rougeL": 0.09604301857137748
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"name": "xcopa_tr",
|
27 |
+
"task": "multiple_choice",
|
28 |
+
"acc": 0.61,
|
29 |
+
"acc_norm": 0.61
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"name": "wmt-tr-en-prompt",
|
33 |
+
"task": "machine_translation",
|
34 |
+
"wer": 13.038665635458035,
|
35 |
+
"bleu": 0.010261135899096054
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"name": "wiki_lingua_tr",
|
39 |
+
"task": "summarization",
|
40 |
+
"rouge1": 0.09429776166714862,
|
41 |
+
"rouge2": 0.02873358785517343,
|
42 |
+
"rougeL": 0.07767336257524773
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"name": "turkish_plu",
|
46 |
+
"task": "multiple_choice",
|
47 |
+
"acc": 0.46944,
|
48 |
+
"acc_norm": 0.49952
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"name": "turkish_plu_goal_inference",
|
52 |
+
"task": "multiple_choice",
|
53 |
+
"acc": 0.4635603345280765,
|
54 |
+
"acc_norm": 0.44683393070489846
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"name": "turkish_plu_next_event_prediction",
|
58 |
+
"task": "multiple_choice",
|
59 |
+
"acc": 0.43206106870229005,
|
60 |
+
"acc_norm": 0.48854961832061067
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"name": "turkish_plu_step_inference",
|
64 |
+
"task": "multiple_choice",
|
65 |
+
"acc": 0.3235294117647059,
|
66 |
+
"acc_norm": 0.4395424836601307
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"name": "turkish_plu_step_ordering",
|
70 |
+
"task": "multiple_choice",
|
71 |
+
"acc": 0.5857002938295789,
|
72 |
+
"acc_norm": 0.5857002938295789
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"name": "check_worthiness",
|
76 |
+
"task": "multiple_choice",
|
77 |
+
"acc": 0.37614259597806216,
|
78 |
+
"acc_norm": 0.37614259597806216
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"name": "relevance_judgment",
|
82 |
+
"task": "multiple_choice",
|
83 |
+
"acc": 0.4218464351005484,
|
84 |
+
"acc_norm": 0.4218464351005484
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"name": "tr-wikihow-summ",
|
88 |
+
"task": "summarization",
|
89 |
+
"rouge1": 0.1602888221320987,
|
90 |
+
"rouge2": 0.04616347811027626,
|
91 |
+
"rougeL": 0.12482407983918105
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"name": "tquad",
|
95 |
+
"task": "extractive_question_answering",
|
96 |
+
"exact_match": 0.007847533632286996,
|
97 |
+
"f1": 0.26089513093937805
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"name": "sts_tr",
|
101 |
+
"task": "text_classification",
|
102 |
+
"acc": 0.1551849166062364,
|
103 |
+
"acc_norm": 0.22697606961566352
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"name": "offenseval_tr",
|
107 |
+
"task": "text_classification",
|
108 |
+
"acc": 0.20294784580498867,
|
109 |
+
"acc_norm": 0.20294784580498867
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"name": "mnli_tr",
|
113 |
+
"task": "natural_language_inference",
|
114 |
+
"acc": 0.3134,
|
115 |
+
"acc_norm": 0.2942
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"name": "snli_tr",
|
119 |
+
"task": "natural_language_inference",
|
120 |
+
"acc": 0.3204,
|
121 |
+
"acc_norm": 0.2894
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"name": "xnli_tr",
|
125 |
+
"task": "natural_language_inference",
|
126 |
+
"acc": 0.32974051896207585,
|
127 |
+
"acc_norm": 0.300998003992016
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"name": "news_cat",
|
131 |
+
"task": "text_classification",
|
132 |
+
"acc": 0.812,
|
133 |
+
"acc_norm": 0.628
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"name": "mlsum_tr",
|
137 |
+
"task": "summarization",
|
138 |
+
"rouge1": 0.15450187559493767,
|
139 |
+
"rouge2": 0.08797823051939649,
|
140 |
+
"rougeL": 0.1350441813405041
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"name": "mkqa_tr",
|
144 |
+
"task": "extractive_question_answering",
|
145 |
+
"exact_match": 0.001479727730097662,
|
146 |
+
"f1": 0.037161672000373895
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"name": "ironytr",
|
150 |
+
"task": "text_classification",
|
151 |
+
"acc": 0.5,
|
152 |
+
"acc_norm": 0.5
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"name": "gecturk_generation",
|
156 |
+
"task": "grammatical_error_correction",
|
157 |
+
"exact_match": 0.00048148683133516297
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"name": "exams_tr",
|
161 |
+
"task": "multiple_choice",
|
162 |
+
"acc": 0.28498727735368956,
|
163 |
+
"acc_norm": 0.3486005089058524
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"name": "belebele_tr",
|
167 |
+
"task": "multiple_choice",
|
168 |
+
"acc": 0.3622222222222222,
|
169 |
+
"acc_norm": 0.3622222222222222
|
170 |
+
}
|
171 |
+
]
|
172 |
+
}
|
results/zero-shot/turna.json
ADDED
@@ -0,0 +1,172 @@
1 |
+
{
|
2 |
+
"model": {
|
3 |
+
"dtype": "auto",
|
4 |
+
"max_length": "1024",
|
5 |
+
"model": "boun-tabi-LMG/TURNA",
|
6 |
+
"api": "hf",
|
7 |
+
"architecture": "T5ForCondtiionalGeneration",
|
8 |
+
"type": "pretrained",
|
9 |
+
"num_parameters": "7b"
|
10 |
+
},
|
11 |
+
"results": [
|
12 |
+
{
|
13 |
+
"name": "xquad_tr",
|
14 |
+
"task": "extractive_question_answering",
|
15 |
+
"exact_match": 0.0,
|
16 |
+
"f1": 0.0
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"name": "xlsum_tr",
|
20 |
+
"task": "summarization",
|
21 |
+
"rouge1": 0.1904384366601188,
|
22 |
+
"rouge2": 0.060686113611140166,
|
23 |
+
"rougeL": 0.1311090280660866
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"name": "xcopa_tr",
|
27 |
+
"task": "multiple_choice",
|
28 |
+
"acc": 0.558,
|
29 |
+
"acc_norm": 0.558
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"name": "wmt-tr-en-prompt",
|
33 |
+
"task": "machine_translation",
|
34 |
+
"wer": 3.9036796738046218,
|
35 |
+
"bleu": 0.0008286617236874524
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"name": "wiki_lingua_tr",
|
39 |
+
"task": "summarization",
|
40 |
+
"rouge1": 0.18435291474691423,
|
41 |
+
"rouge2": 0.05584649726914134,
|
42 |
+
"rougeL": 0.13446021077350823
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"name": "turkish_plu",
|
46 |
+
"task": "multiple_choice",
|
47 |
+
"acc": 0.40288,
|
48 |
+
"acc_norm": 0.44608
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"name": "turkish_plu_goal_inference",
|
52 |
+
"task": "multiple_choice",
|
53 |
+
"acc": 0.37992831541218636,
|
54 |
+
"acc_norm": 0.35722819593787336
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"name": "turkish_plu_next_event_prediction",
|
58 |
+
"task": "multiple_choice",
|
59 |
+
"acc": 0.383206106870229,
|
60 |
+
"acc_norm": 0.4488549618320611
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"name": "turkish_plu_step_inference",
|
64 |
+
"task": "multiple_choice",
|
65 |
+
"acc": 0.272875816993464,
|
66 |
+
"acc_norm": 0.4542483660130719
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"name": "turkish_plu_step_ordering",
|
70 |
+
"task": "multiple_choice",
|
71 |
+
"acc": 0.5122428991185113,
|
72 |
+
"acc_norm": 0.5122428991185113
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"name": "check_worthiness",
|
76 |
+
"task": "multiple_choice",
|
77 |
+
"acc": 0.42230347349177333,
|
78 |
+
"acc_norm": 0.620201096892139
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"name": "relevance_judgment",
|
82 |
+
"task": "multiple_choice",
|
83 |
+
"acc": 0.4904021937842779,
|
84 |
+
"acc_norm": 0.5781535648994516
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"name": "tr-wikihow-summ",
|
88 |
+
"task": "summarization",
|
89 |
+
"rouge1": 0.20515501424269858,
|
90 |
+
"rouge2": 0.05693981251975118,
|
91 |
+
"rougeL": 0.1449313333992171
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"name": "tquad",
|
95 |
+
"task": "extractive_question_answering",
|
96 |
+
"exact_match": 0.0,
|
97 |
+
"f1": 0.0003736920777279522
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"name": "sts_tr",
|
101 |
+
"task": "text_classification",
|
102 |
+
"acc": 0.14213197969543148,
|
103 |
+
"acc_norm": 0.19506889050036258
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"name": "offenseval_tr",
|
107 |
+
"task": "text_classification",
|
108 |
+
"acc": 0.5099206349206349,
|
109 |
+
"acc_norm": 0.7970521541950113
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"name": "mnli_tr",
|
113 |
+
"task": "natural_language_inference",
|
114 |
+
"acc": 0.3203,
|
115 |
+
"acc_norm": 0.3159
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"name": "snli_tr",
|
119 |
+
"task": "natural_language_inference",
|
120 |
+
"acc": 0.3223,
|
121 |
+
"acc_norm": 0.3278
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"name": "xnli_tr",
|
125 |
+
"task": "natural_language_inference",
|
126 |
+
"acc": 0.32974051896207585,
|
127 |
+
"acc_norm": 0.3277445109780439
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"name": "news_cat",
|
131 |
+
"task": "text_classification",
|
132 |
+
"acc": 0.328,
|
133 |
+
"acc_norm": 0.208
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"name": "mlsum_tr",
|
137 |
+
"task": "summarization",
|
138 |
+
"rouge1": 0.20830277213555015,
|
139 |
+
"rouge2": 0.11040542892341527,
|
140 |
+
"rougeL": 0.16135585618616377
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"name": "mkqa_tr",
|
144 |
+
"task": "extractive_question_answering",
|
145 |
+
"exact_match": 0.0011837821840781297,
|
146 |
+
"f1": 0.006720430107526878
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"name": "ironytr",
|
150 |
+
"task": "text_classification",
|
151 |
+
"acc": 0.48333333333333334,
|
152 |
+
"acc_norm": 0.5033333333333333
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"name": "gecturk_generation",
|
156 |
+
"task": "grammatical_error_correction",
|
157 |
+
"exact_match": 0.0
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"name": "exams_tr",
|
161 |
+
"task": "multiple_choice",
|
162 |
+
"acc": 0.2366412213740458,
|
163 |
+
"acc_norm": 0.2748091603053435
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"name": "belebele_tr",
|
167 |
+
"task": "multiple_choice",
|
168 |
+
"acc": 0.22555555555555556,
|
169 |
+
"acc_norm": 0.22555555555555556
|
170 |
+
}
|
171 |
+
]
|
172 |
+
}
|
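All of the result files above share one schema: a "model" metadata block plus a flat "results" list with one entry per dataset, whose score keys depend on the task. A minimal, illustrative snippet (not part of this commit) for reading one of these files, assuming it is opened from the results/zero-shot/ directory added here:

import json

# any of the files above has the same layout; turna.json is just an example
with open("results/zero-shot/turna.json") as f:
    report = json.load(f)

print(report["model"]["model"], report["model"]["num_parameters"])
for entry in report["results"]:
    # score keys vary by task: acc/acc_norm, exact_match/f1, rouge1/rouge2/rougeL, wer/bleu
    scores = {k: v for k, v in entry.items() if k not in ("name", "task")}
    print(entry["name"], entry["task"], scores)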
utils.py
ADDED
@@ -0,0 +1,28 @@
1 |
+
import os
|
2 |
+
import os.path as osp
|
3 |
+
import json
|
4 |
+
|
5 |
+
|
6 |
+
def preprocess_path(path):
|
7 |
+
path = osp.expanduser(path)
|
8 |
+
path = osp.abspath(path)
|
9 |
+
return path
|
10 |
+
|
11 |
+
|
12 |
+
def get_model_url(entry):
|
13 |
+
if entry['api'] == 'hf':
|
14 |
+
return f'https://huggingface.co/{entry["model"]}'
|
15 |
+
return entry.get('url', f'https://localhost/{entry["model"]}')
|
16 |
+
|
17 |
+
|
18 |
+
def read_results(path):
|
19 |
+
path = preprocess_path(path)
|
20 |
+
file_list = sorted(os.listdir(path))
|
21 |
+
results = list()
|
22 |
+
for file_name in file_list:
|
23 |
+
file_path = osp.join(path, file_name)
|
24 |
+
with open(file_path, 'r') as f:
|
25 |
+
this = json.load(f)
|
26 |
+
results.append(this)
|
27 |
+
return results
|
28 |
+
|
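A minimal usage sketch for the helpers above (illustrative only, not part of this commit): read_results loads every JSON file found in a results directory, and get_model_url builds the Hugging Face link from each file's "model" block. The directory path and the accuracy averaging below are assumptions made for the example, not code from this repository.

from utils import get_model_url, read_results

entries = read_results("results/zero-shot")  # one dict per results/zero-shot/*.json
for entry in entries:
    model = entry["model"]
    # average the plain accuracy over every task that reports one (example metric only)
    accs = [r["acc"] for r in entry["results"] if r.get("acc") is not None]
    mean_acc = sum(accs) / len(accs) if accs else float("nan")
    print(model["model"], get_model_url(model), round(mean_acc, 3))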