Ilker Kesen committed on
Commit 500fbd7 · 1 Parent(s): 74daf31

initialize the first version

.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 KUIS AI Center
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1 @@
- ---
- title: Pergel
- emoji: 📈
- colorFrom: blue
- colorTo: pink
- sdk: streamlit
- sdk_version: 1.40.2
- app_file: app.py
- pinned: false
- license: mit
- short_description: 'Pergel: A Unified Benchmark for Evaluating Turkish LLMs'
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Cetvel-leaderboard
app.py ADDED
@@ -0,0 +1,197 @@
+ import streamlit as st
+ import pandas as pd
+ import json
+ from utils import read_results, preprocess_path, get_model_url
+ from data import Tasks, Metrics, DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS
+
+
+ st.set_page_config(
+     page_title='Cetvel 📏',
+     layout='centered',
+ )
+
+
+ @st.cache_data
+ def cache_results(path):
+     json_results = read_results(path)
+     results = list()
+     for entry in json_results:
+         row = {
+             'model': entry['model']['model'],
+             'num_parameters': entry['model']['num_parameters'],
+             'url': get_model_url(entry['model']),
+             'architecture': entry['model']['architecture'],
+             'type': entry['model']['type'],
+             'precision': entry['model']['dtype'],
+         }
+         for result in entry['results']:
+             task = result['task']
+             metric = TASK_METRIC_DICT.get(task)
+             score = result.get(metric)
+             score = 100 * score if metric != Metrics.WER and score is not None else score
+             row[result['name']] = score
+         results.append(row)
+     df = pd.DataFrame(results)
+     for group, metadata in DATASET_GROUPS.items():
+         df[group] = df[metadata['datasets']].mean(axis=1)
+     return df
+
+
+ @st.cache_data
+ def cache_datasets(path):
+     path = preprocess_path(path)
+     with open(path, 'r') as f:
+         datasets = json.load(f)
+     for key in datasets.keys():
+         datasets[key]['dataset'] = key
+     return datasets
+
+
+ def create_column_configs(items):
+     column_configs = dict()
+     for key, metadata in items.items():
+         column_configs[key] = st.column_config.NumberColumn(
+             metadata.get('name', key),
+             help=metadata['description'],
+             min_value=0,
+             format="%2.2f"
+         )
+     return column_configs
+
+
+ def insert_average(df, keys):
+     df = df.copy(deep=True)
+     df['average'] = df.loc[:, [x for x in df.columns if x in keys]].mean(axis=1)
+     df.insert(1, 'average', df.pop('average'))
+     df.index += 1
+     return df.sort_values(by=['average'], ascending=False)
+
+
+ MODEL_SPEC_CONFIGS = {
+     'model': st.column_config.TextColumn(
+         'Model',
+         help='Large Language Model (LLM) used for the experiments.',
+         max_chars=120,
+
+     ),
+     'url': st.column_config.LinkColumn(
+         'URL',
+         help='Model URL.',
+         display_text='Click',
+     ),
+     'num_parameters': st.column_config.TextColumn(
+         '#params',
+         help='Approximate number of parameters.',
+     ),
+     'type': st.column_config.TextColumn(
+         'Type',
+         help='Model type based on training objective.',
+     ),
+     'average': st.column_config.NumberColumn(
+         'Avg.',
+         help='Average across task or dataset performances.',
+         format="%2.2f",
+     )
+ }
+
+
+ def filter_visible_model_specs():
+     specs = {
+         'URL': ('url', 1),
+         '#params': ('num_parameters', 2),
+         'Architecture': ('architecture', 3),
+         'Type': ('type', 4),
+         'Precision': ('precision', 5),
+     }
+     visible_specs = st.multiselect(
+         'Select model specs to be shown in the table.',
+         options=sorted(specs.keys(), key=lambda x: specs[x][1]),
+     )
+     # visible_specs = sorted(visible_specs, key=lambda x: specs[x][1])
+     return [specs[x][0] for x in visible_specs]
+
+
+ def filter_by_model_spec():
+     pass
+
+
+ def filter_visible_datasets(datasets):
+     col1, col2 = st.columns(2)
+     with col1:
+         dataset_grouping = st.selectbox(
+             'Dataset Grouping',
+             [
+                 'Group Datasets',
+                 'Show All Datasets',
+             ],
+         )
+
+     with col2:
+         filter_by_task = st.selectbox(
+             'Filter by Task',
+             [
+                 'All',
+                 'Understanding Tasks',
+                 'Generation Tasks',
+                 'Multiple Choice',
+                 'Extractive Question Answering',
+                 'Natural Language Inference',
+                 'Text Classification',
+                 'Summarization',
+             ],
+             disabled=dataset_grouping == "Group Datasets",
+         )
+
+     if dataset_grouping == 'Group Datasets':
+         return list(DATASET_GROUPS.keys())
+     elif dataset_grouping == 'Show All Datasets':
+         if filter_by_task == 'All':
+             return list(datasets.keys())
+         elif filter_by_task == 'Understanding Tasks':
+             this_datasets = [k for (k, v) in datasets.items() if not v['generative']]
+             return this_datasets
+         elif filter_by_task == 'Generation Tasks':
+             this_datasets = [k for (k, v) in datasets.items() if v['generative']]
+             return this_datasets
+         elif filter_by_task == 'Multiple Choice':
+             return DATASET_GROUPS['MCQA']['datasets']
+         elif filter_by_task == 'Extractive Question Answering':
+             return DATASET_GROUPS['QA']['datasets']
+         elif filter_by_task == 'Natural Language Inference':
+             return DATASET_GROUPS['NLI']['datasets']
+         elif filter_by_task == 'Text Classification':
+             return DATASET_GROUPS['TC']['datasets']
+         elif filter_by_task == 'Summarization':
+             return DATASET_GROUPS['SUM']['datasets']
+
+
+ def introduction():
+     st.title(':blue[Cetvel :straight_ruler:]')
+     st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
+     st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool, specifically includes tasks/datasets for benchmarking Turkish Large Language Models (LLMs). Cetvel includes a variety of tasks curated to assess different aspects of model performance in the Turkish language. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, you can check the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')
+
+
+ def main():
+     introduction()
+     results_df = cache_results('./results/zero-shot')
+     datasets = cache_datasets('./data/datasets.json')
+     dataset_column_configs = create_column_configs(datasets)
+     group_column_configs = create_column_configs(DATASET_GROUPS)
+     # score_columns = list(dataset_column_configs.keys()) + list(group_column_configs.keys())
+     column_configs = MODEL_SPEC_CONFIGS | group_column_configs | dataset_column_configs
+
+     visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
+     visible_model_columns = filter_visible_model_specs()
+     results_df = insert_average(results_df, visible_data_columns)
+
+     st.dataframe(
+         results_df,
+         use_container_width=True,
+         hide_index=True,
+         column_config=column_configs,
+         column_order=['model', 'average',] + visible_model_columns + visible_data_columns,
+     )
+     st.image('./assets/kuis-ai-logo.png', width=240)
+
+
+ main()
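
Note on the scoring step in `cache_results` above: every headline metric except WER is multiplied by 100, and each dataset group column is the unweighted mean of its member datasets. A minimal standalone sketch of that averaging step, using toy scores and a hypothetical model name:

```python
import pandas as pd

# Toy row mimicking one entry produced by cache_results (scores already scaled to 0-100).
row = {'model': 'toy-llm', 'xquad_tr': 24.7, 'tquad': 20.6, 'mkqa_tr': 10.0}
df = pd.DataFrame([row])

# Group average, as in `df[group] = df[metadata['datasets']].mean(axis=1)` above.
df['QA'] = df[['xquad_tr', 'tquad', 'mkqa_tr']].mean(axis=1)
print(df['QA'].iloc[0])  # (24.7 + 20.6 + 10.0) / 3 ≈ 18.43
```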
assets/kuis-ai-logo.png ADDED
data.py ADDED
@@ -0,0 +1,121 @@
+ from enum import StrEnum, auto
+
+
+ class Tasks(StrEnum):
+     EXTRACTIVE_QUESTION_ANSWERING = auto()
+     MULTIPLE_CHOICE = auto()
+     SUMMARIZATION = auto()
+     NATURAL_LANGUAGE_INFERENCE = auto()
+     TEXT_CLASSIFICATION = auto()
+     MACHINE_TRANSLATION = auto()
+     GRAMMATICAL_ERROR_CORRECTION = auto()
+
+
+ class Metrics(StrEnum):
+     F1 = "f1"
+     EXACT_MATCH = "exact_match"
+     ROGUE1 = "rouge1"
+     ROUGE2 = "rouge2"
+     ROUGEL = "rougeL"
+     ACCURACY = "acc"
+     WER = "wer"
+     BLEU = "bleu"
+
+
+ DATASET_TASK_DICT = {
+     # extractive qa
+     'xquad_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+     'tquad': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+     'mkqa_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING, # not exactly
+
+     # summarization
+     'xlsum_tr': Tasks.SUMMARIZATION,
+     'mlsum_tr': Tasks.SUMMARIZATION,
+     'wiki_lingua_tr': Tasks.SUMMARIZATION,
+     'tr-wikihow-summ': Tasks.SUMMARIZATION,
+
+     # NLI
+     #'nli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'mnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'snli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'xnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+
+     # multiple-choice
+     'xcopa_tr': Tasks.MULTIPLE_CHOICE,
+     'exams_tr': Tasks.MULTIPLE_CHOICE,
+     'belebele_tr': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_goal_inference': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
+
+     # fact-checking, not sure whether these are multi-choice
+     # 'trclaim19': Tasks.MULTIPLE_CHOICE,
+     'check_worthiness': Tasks.MULTIPLE_CHOICE,
+     'relevance_judgment': Tasks.MULTIPLE_CHOICE,
+
+     # text classification
+     'sts_tr': Tasks.TEXT_CLASSIFICATION,
+     'offenseval_tr': Tasks.TEXT_CLASSIFICATION,
+     'news_cat': Tasks.TEXT_CLASSIFICATION,
+     'ironytr': Tasks.TEXT_CLASSIFICATION,
+
+     # other generation
+     'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
+     'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
+ }
+
+
+ TASK_METRIC_DICT = {
+     Tasks.EXTRACTIVE_QUESTION_ANSWERING: Metrics.EXACT_MATCH,
+     Tasks.MULTIPLE_CHOICE: Metrics.ACCURACY,
+     Tasks.TEXT_CLASSIFICATION: Metrics.ACCURACY,
+     Tasks.NATURAL_LANGUAGE_INFERENCE: Metrics.ACCURACY,
+     Tasks.SUMMARIZATION: Metrics.ROUGE2,
+     Tasks.MACHINE_TRANSLATION: Metrics.BLEU,
+     Tasks.GRAMMATICAL_ERROR_CORRECTION: Metrics.EXACT_MATCH,
+ }
+
+
+ GENERATIVE_TASKS = (
+     Tasks.SUMMARIZATION,
+     Tasks.MACHINE_TRANSLATION,
+     Tasks.GRAMMATICAL_ERROR_CORRECTION,
+ )
+
+ DATASET_GROUPS = {
+     'QA': {
+         'datasets': ['xquad_tr', 'tquad', 'mkqa_tr'],
+         'description': 'Turkish splits of SQuAD-like datasets XQuAD and TQUAD.',
+     },
+     'MCQA': {
+         'datasets': ['xcopa_tr', 'exams_tr', 'belebele_tr'] + [x for x in DATASET_TASK_DICT.keys() if x.startswith('turkish_plu')],
+         'description': 'Multiple Choice Question Answering datasets: XCOPA, Exams, Belebele and Turkish PLU.'
+     },
+     'TC': {
+         'datasets': ['sts_tr', 'offenseval_tr', 'news_cat', 'ironytr', ],
+         'description': 'Text Classification datasets.',
+     },
+     'NLI': {
+         'datasets': ['mnli_tr', 'snli_tr', 'xnli_tr'],
+         'description': 'Natural Language Inference (NLI) datasets in Turkish: XNLI, SNLI and MNLI.',
+     },
+     'SUM': {
+         'datasets': [name for name, task in DATASET_TASK_DICT.items() if task == Tasks.SUMMARIZATION],
+         'description': 'Summarization datasets in Turkish (XLSum, MLSum, WikiLingua and TrWikiHowSumm).',
+     },
+     'GEC': {
+         'datasets': ['gecturk_generation',],
+         'description': 'Grammatical Error Correction task.',
+     },
+     'MT': {
+         'datasets': ['wmt-tr-en-prompt'],
+         'description': 'Machine Translation on WMT-16 dataset (English-to-Turkish).',
+     },
+
+     # 'TrClaim19': {
+     #     'datasets': ['check_worthiness', 'relevance_judgment'],
+     #     'description': 'TrClaim19 dataset for fact-checking.',
+     # },
+ }
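
The two dictionaries above determine which score the leaderboard reports for each dataset, and `DATASET_GROUPS` defines the aggregated columns. A small usage sketch, assuming `data.py` is importable as in this repository:

```python
from data import DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS

# A dataset maps to its task, and the task maps to the headline metric.
task = DATASET_TASK_DICT['xquad_tr']     # Tasks.EXTRACTIVE_QUESTION_ANSWERING
metric = TASK_METRIC_DICT[task]          # Metrics.EXACT_MATCH
print(task, metric)

# Group membership used for the aggregated columns in app.py.
print(DATASET_GROUPS['SUM']['datasets'])  # all summarization datasets
```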
data/datasets.json ADDED
@@ -0,0 +1,185 @@
1
+ {
2
+ "tquad": {
3
+ "name": "TQUAD",
4
+ "task": "extractive_question_answering",
5
+ "description": "This dataset is the Turkish Question & Answer dataset on Turkish & Islamic Science History within the scope of Teknofest 2018 Artificial Intelligence competition.",
6
+ "url": "https://github.com/TQuad/turkish-nlp-qa-dataset",
7
+ "hf_name": "mcemilg/tquad",
8
+ "generative": false
9
+ },
10
+ "xquad_tr": {
11
+ "name": "XQUAD",
12
+ "task": "extractive_question_answering",
13
+ "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi..",
14
+ "url": "https://github.com/google-deepmind/xquad",
15
+ "hf_name": "google/xquad",
16
+ "generative": false
17
+ },
18
+ "mkqa_tr": {
19
+ "name": "MKQA",
20
+ "task": "extractive_question_answering",
21
+ "description": "MKQA: Multilingual Knowledge Questions & Answers. MKQA includes 10k open-domain question-answer pairs in 26 languages, resulting 260k examples in total.",
22
+ "url": "https://github.com/apple/ml-mkqa",
23
+ "hf_name": "mcemilg/mkqa_tr",
24
+ "generative": false
25
+ },
26
+ "xlsum_tr": {
27
+ "name": "XLSum",
28
+ "task": "summarization",
29
+ "description": "Abstractive summarization dataset for 44 languages.",
30
+ "url": "https://github.com/csebuetnlp/xl-sum",
31
+ "hf_name": "csebuetnlp/xlsum",
32
+ "generative": true
33
+ },
34
+ "mlsum_tr": {
35
+ "name": "MLSum",
36
+ "task": "summarization",
37
+ "description": "A multilingual summarization dataset collected from the newspapers' websites. MLSum contains 1.5M examples in 5 languages including Turkish.",
38
+ "url": "https://huggingface.co/datasets/reciTAL/mlsum",
39
+ "hf_name": "reciTAL/mlsum",
40
+ "generative": true
41
+ },
42
+ "wiki_lingua_tr": {
43
+ "name": "WikiLingua",
44
+ "task": "summarization",
45
+ "description": "A multilingual abstractive summarization dataset covering 17 languages.",
46
+ "url": "https://github.com/esdurmus/Wikilingua",
47
+ "hf_name": "GEM/wiki_lingua",
48
+ "generative": true
49
+ },
50
+ "tr-wikihow-summ": {
51
+ "name": "WikiHowSumm",
52
+ "task": "summarization",
53
+ "description": "A summarization dataset obtained from WikiHow website.",
54
+ "url": "https://huggingface.co/datasets/ardauzunoglu/tr-wikihow-summ",
55
+ "hf_name": "ardauzunoglu/tr-wikihow-summ",
56
+ "generative": true
57
+ },
58
+ "mnli_tr": {
59
+ "name": "MNLI",
60
+ "task": "natural_language_inference",
61
+ "description": "Multi-Genre NLI (MNLI) dataset.",
62
+ "url": "https://cims.nyu.edu/~sbowman/multinli/",
63
+ "hf_name": "boun-tabi/nli_tr",
64
+ "generative": false
65
+ },
66
+ "snli_tr": {
67
+ "name": "SNLI",
68
+ "task": "natural_language_inference",
69
+ "description": "The Stanford NLI (SNLI) dataset.",
70
+ "url": "https://nlp.stanford.edu/projects/snli/",
71
+ "hf_name": "boun-tabi/nli_tr",
72
+ "generative": false
73
+ },
74
+ "xnli_tr": {
75
+ "name": "XNLI",
76
+ "task": "natural_language_inference",
77
+ "description": "The Cross-Lingual NLI (XNLI) dataset.",
78
+ "url": "https://github.com/facebookresearch/XNLI",
79
+ "hf_name": "boun-tabi/nli_tr",
80
+ "generative": false
81
+ },
82
+ "xcopa_tr": {
83
+ "name": "XCOPA",
84
+ "task": "multiple_choice",
85
+ "description": "A multilingual dataset for evaluating causal commonsense reasoning capabilities of language models.",
86
+ "url": "https://github.com/cambridgeltl/xcopa",
87
+ "hf_name": "cambridgeltl/xcopa",
88
+ "generative": false
89
+ },
90
+ "exams_tr": {
91
+ "name": "Exams",
92
+ "task": "multiple_choice",
93
+ "description": "A question answering dataset covering high school exams.",
94
+ "url": "https://huggingface.co/datasets/exams",
95
+ "hf_name": "exams",
96
+ "generative": false
97
+ },
98
+ "belebele_tr": {
99
+ "name": "Belebele",
100
+ "task": "multiple_choice",
101
+ "description": "A multiple choice question answering dataset to evaluate machine comprehension.",
102
+ "url": "https://github.com/facebookresearch/belebele",
103
+ "generative": false
104
+ },
105
+ "turkish_plu_goal_inference": {
106
+ "name": "PLU-GI",
107
+ "task": "multiple_choice",
108
+ "description": "TurkishPLU - Goal Inference task.",
109
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
110
+ "hf_name": "mcemilg/turkish-plu-goal-inference",
111
+ "generative": false
112
+ },
113
+ "turkish_plu_next_event_prediction": {
114
+ "name": "PLU-NE",
115
+ "task": "multiple_choice",
116
+ "description": "TurkishPLU - Next Event Prediction task.",
117
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
118
+ "hf_name": "mcemilg/turkish-plu-next-event-prediction",
119
+ "generative": false
120
+ },
121
+ "turkish_plu_step_inference": {
122
+ "name": "PLU-SI",
123
+ "task": "multiple_choice",
124
+ "description": "TurkishPLU - Step Inference task.",
125
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
126
+ "hf_name": "mcemilg/turkish-plu-step-inference",
127
+ "generative": false
128
+ },
129
+ "turkish_plu_step_ordering": {
130
+ "name": "PLU-SO",
131
+ "task": "multiple_choice",
132
+ "description": "TurkishPLU - Step Ordering task.",
133
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
134
+ "hf_name": "mcemilg/turkish-plu-step-ordering",
135
+ "generative": false
136
+ },
137
+ "sts_tr": {
138
+ "name": "STS",
139
+ "task": "text_classification",
140
+ "description": "The machine-translated Semantic Textual Similarity dataset in Turkish.",
141
+ "url": "https://github.com/emrecncelik/sts-benchmark-tr",
142
+ "hf_name": "emrecan/stsb-mt-turkish",
143
+ "generative": false
144
+ },
145
+ "offenseval_tr": {
146
+ "name": "OffensEval",
147
+ "task": "text_classification",
148
+ "description": "A dataset for offensive speech recognition in Turkish.",
149
+ "url": "https://sites.google.com/site/offensevalsharedtask/offenseval-2020",
150
+ "hf_name": "coltekin/offenseval2020_tr",
151
+ "generative": false
152
+ },
153
+ "news_cat": {
154
+ "name": "NewsCat",
155
+ "task": "text_classification",
156
+ "description": "News classification dataset collected from Turkish newspapers websites.",
157
+ "url": "http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html",
158
+ "hf_name": "mcemilg/news-cat",
159
+ "generative": false
160
+ },
161
+ "ironytr": {
162
+ "name": "IronyTR",
163
+ "task": "text_classification",
164
+ "description": "Irony detection dataset in Turkish.",
165
+ "url": "https://github.com/teghub/IronyTR",
166
+ "hf_name": "mcemilg/IronyTR",
167
+ "generative": false
168
+ },
169
+ "wmt-tr-en-prompt": {
170
+ "name": "WMT",
171
+ "task": "machine_translation",
172
+ "description": "English-to-Turkish machine translation dataset.",
173
+ "url": "http://www.aclweb.org/anthology/W/W16/W16-2301",
174
+ "hf_name": "wmt/wmt16",
175
+ "generative": true
176
+ },
177
+ "gecturk_generation": {
178
+ "name": "GECTurk",
179
+ "task": "grammatical_error_correction",
180
+ "description": "A dataset for grammatical error correction.",
181
+ "url": "https://github.com/GGLAB-KU/gecturk",
182
+ "hf_name": "mcemilg/GECTurk-generation",
183
+ "generative": true
184
+ }
185
+ }
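
Each entry in this file is keyed by the dataset's lm-eval-harness task name; `cache_datasets` in `app.py` loads the file and copies the key into a `dataset` field before building the column configs. A minimal sketch of that consumption, using the path from the repository:

```python
import json

with open('data/datasets.json') as f:
    datasets = json.load(f)

# Mirror of the loop in app.cache_datasets: expose the key inside each record.
for key in datasets:
    datasets[key]['dataset'] = key

print(datasets['xquad_tr']['task'])        # extractive_question_answering
print(datasets['xquad_tr']['generative'])  # False
```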
environment.yaml ADDED
@@ -0,0 +1,93 @@
1
+ name: Cetvel-leaderboard
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.7.2=h06a4308_0
9
+ - expat=2.6.2=h6a678d5_0
10
+ - ld_impl_linux-64=2.38=h1181459_1
11
+ - libffi=3.4.4=h6a678d5_1
12
+ - libgcc-ng=11.2.0=h1234567_1
13
+ - libgomp=11.2.0=h1234567_1
14
+ - libstdcxx-ng=11.2.0=h1234567_1
15
+ - libuuid=1.41.5=h5eee18b_0
16
+ - ncurses=6.4=h6a678d5_0
17
+ - openssl=3.0.14=h5eee18b_0
18
+ - python=3.12.4=h5148396_1
19
+ - readline=8.2=h5eee18b_0
20
+ - sqlite=3.45.3=h5eee18b_0
21
+ - tk=8.6.14=h39e8969_0
22
+ - wheel=0.43.0=py312h06a4308_0
23
+ - xz=5.4.6=h5eee18b_1
24
+ - zlib=1.2.13=h5eee18b_1
25
+ - pip:
26
+ - altair==5.3.0
27
+ - asttokens==2.4.1
28
+ - attrs==23.2.0
29
+ - blinker==1.8.2
30
+ - cachetools==5.3.3
31
+ - certifi==2024.7.4
32
+ - charset-normalizer==3.3.2
33
+ - click==8.1.7
34
+ - contourpy==1.2.1
35
+ - cycler==0.12.1
36
+ - decorator==5.1.1
37
+ - executing==2.0.1
38
+ - fonttools==4.53.1
39
+ - gitdb==4.0.11
40
+ - gitpython==3.1.43
41
+ - idna==3.7
42
+ - ipdb==0.13.13
43
+ - ipython==8.26.0
44
+ - jedi==0.19.1
45
+ - jinja2==3.1.4
46
+ - jsonschema==4.23.0
47
+ - jsonschema-specifications==2023.12.1
48
+ - kiwisolver==1.4.5
49
+ - markdown-it-py==3.0.0
50
+ - markupsafe==2.1.5
51
+ - matplotlib==3.9.1
52
+ - matplotlib-inline==0.1.7
53
+ - mdurl==0.1.2
54
+ - numpy==2.0.0
55
+ - packaging==24.1
56
+ - pandas==2.2.2
57
+ - parso==0.8.4
58
+ - pexpect==4.9.0
59
+ - pillow==10.4.0
60
+ - pip==24.1.2
61
+ - prompt-toolkit==3.0.47
62
+ - protobuf==5.27.2
63
+ - ptyprocess==0.7.0
64
+ - pure-eval==0.2.2
65
+ - pyarrow==16.1.0
66
+ - pydeck==0.9.1
67
+ - pygments==2.18.0
68
+ - pyparsing==3.1.2
69
+ - python-dateutil==2.9.0.post0
70
+ - pytz==2024.1
71
+ - redis==5.0.7
72
+ - referencing==0.35.1
73
+ - requests==2.32.3
74
+ - rich==13.7.1
75
+ - rpds-py==0.19.0
76
+ - semantic-version==2.10.0
77
+ - setuptools==70.3.0
78
+ - setuptools-rust==1.9.0
79
+ - six==1.16.0
80
+ - smmap==5.0.1
81
+ - stack-data==0.6.3
82
+ - streamlit==1.36.0
83
+ - tenacity==8.5.0
84
+ - toml==0.10.2
85
+ - toolz==0.12.1
86
+ - tornado==6.4.1
87
+ - traitlets==5.14.3
88
+ - typing-extensions==4.12.2
89
+ - tzdata==2024.1
90
+ - urllib3==2.2.2
91
+ - watchdog==4.0.1
92
+ - wcwidth==0.2.13
93
+ prefix: /home/ilker/miniconda3/envs/streamlit-tutor
process_result.py ADDED
@@ -0,0 +1,72 @@
+ import os.path as osp
+ import argparse
+ import json
+ from data import Tasks, DATASET_TASK_DICT
+ from utils import preprocess_path
+
+
+ def process_result(entry, name, task):
+     processed = {
+         'name': name,
+         'task': str(task),
+     }
+
+     if task == Tasks.EXTRACTIVE_QUESTION_ANSWERING:
+         key = 'em,none' if name == 'mkqa_tr' else 'exact,none'
+         scale = 0.01 if name != 'mkqa_tr' else 1
+         processed['exact_match'] = scale * entry[key]
+         processed['f1'] = scale * entry['f1,none']
+     elif task == Tasks.SUMMARIZATION:
+         processed['rouge1'] = entry['rouge1,none']
+         processed['rouge2'] = entry['rouge2,none']
+         processed['rougeL'] = entry['rougeL,none']
+     elif task in (
+         Tasks.MULTIPLE_CHOICE,
+         Tasks.NATURAL_LANGUAGE_INFERENCE,
+         Tasks.TEXT_CLASSIFICATION,
+     ):
+         processed['acc'] = entry['acc,none']
+         processed['acc_norm'] = entry.get('acc_norm,none', processed['acc'])
+     elif task == Tasks.MACHINE_TRANSLATION:
+         processed['wer'] = entry['wer,none']
+         processed['bleu'] = entry['bleu,none']
+     elif task == Tasks.GRAMMATICAL_ERROR_CORRECTION:
+         processed['exact_match'] = entry['exact_match,none']
+
+     return processed
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Results file formatter.')
+     parser.add_argument('-i', '--input-file', type=str, help='Input JSON file for the results.')
+     parser.add_argument('-o', '--output-file', type=str, help='Output JSON file for the formatted results.')
+     args = parser.parse_args()
+
+     with open(preprocess_path(args.input_file)) as f:
+         raw_data = json.load(f)
+
+     # first, get model args
+     model_args = raw_data['config']['model_args'].split(',')
+     model_args = dict([tuple(pair.split('=')) for pair in model_args])
+     processed = dict()
+     model_args['model'] = model_args.pop('pretrained')
+     processed['model'] = model_args
+     processed['model']['api'] = raw_data['config']['model']
+
+     # then, process results
+     results = raw_data['results']
+     processed['results'] = list()
+     for dataset, entry in results.items():
+         if dataset not in DATASET_TASK_DICT.keys():
+             continue
+         task = DATASET_TASK_DICT[dataset]
+         processed['results'].append(process_result(entry, dataset, task))
+
+     with open(preprocess_path(args.output_file), 'w') as f:
+         json.dump(processed, f, indent=4)
+
+     print('done')
+
+
+ if __name__ == '__main__':
+     main()
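
This formatter is invoked as `python process_result.py -i <raw>.json -o <formatted>.json` and expects a raw lm-eval-harness results file with a `config.model_args` string plus per-dataset metric keys such as `acc,none`. A small sketch of calling `process_result` directly on one such entry; the numbers are illustrative only:

```python
from data import Tasks
from process_result import process_result

# Hypothetical raw entry in lm-eval-harness style for a multiple-choice dataset.
raw_entry = {'acc,none': 0.6067, 'acc_norm,none': 0.6067}
print(process_result(raw_entry, 'belebele_tr', Tasks.MULTIPLE_CHOICE))
# -> {'name': 'belebele_tr', 'task': 'multiple_choice', 'acc': 0.6067, 'acc_norm': 0.6067}
```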
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ altair==5.3.0
+ click==8.1.7
+ matplotlib==3.9.1
+ numpy==2.0.0
+ pandas==2.2.2
+ pillow==10.4.0
+ streamlit==1.36.0
+ tornado==6.4.1
results/zero-shot/aya-23-8b.json ADDED
@@ -0,0 +1,161 @@
1
+ {
2
+ "model": {
3
+ "load_in_8bit": "True",
4
+ "trust_remote_code": "True",
5
+ "model": "CohereForAI/aya-23-8B",
6
+ "api": "hf",
7
+ "architecture": "CohereForCausalLM",
8
+ "dtype": "float16",
9
+ "max_length": 8192,
10
+ "type": "instruction-tuned",
11
+ "num_parameters": "8b"
12
+ },
13
+ "results": [
14
+ {
15
+ "name": "belebele_tr",
16
+ "task": "multiple_choice",
17
+ "acc": 0.6067,
18
+ "acc_norm": 0.6067
19
+ },
20
+ {
21
+ "name": "exams_tr",
22
+ "task": "multiple_choice",
23
+ "acc": 0.2697,
24
+ "acc_norm": 0.2901
25
+ },
26
+ {
27
+ "name": "check_worthiness",
28
+ "task": "multiple_choice",
29
+ "acc": 0.38345521023765994,
30
+ "acc_norm": 0.49177330895795246
31
+ },
32
+ {
33
+ "name": "ironytr",
34
+ "task": "text_classification",
35
+ "acc": 0.5166666666666667,
36
+ "acc_norm": 0.5016666666666667
37
+ },
38
+ {
39
+ "name": "mkqa_tr",
40
+ "task": "extractive_question_answering",
41
+ "exact_match": 0.10017756732761172,
42
+ "f1": 0.16569513329103133
43
+ },
44
+ {
45
+ "name": "mnli_tr",
46
+ "task": "natural_language_inference",
47
+ "acc": 0.3436,
48
+ "acc_norm": 0.3477
49
+ },
50
+ {
51
+ "name": "news_cat",
52
+ "task": "text_classification",
53
+ "acc": 0.724,
54
+ "acc_norm": 0.632
55
+ },
56
+ {
57
+ "name": "offenseval_tr",
58
+ "task": "text_classification",
59
+ "acc": 0.3424036281179138,
60
+ "acc_norm": 0.7865646258503401
61
+ },
62
+ {
63
+ "name": "relevance_judgment",
64
+ "task": "multiple_choice",
65
+ "acc": 0.42550274223034734,
66
+ "acc_norm": 0.4273308957952468
67
+ },
68
+ {
69
+ "name": "snli_tr",
70
+ "task": "natural_language_inference",
71
+ "acc": 0.3249,
72
+ "acc_norm": 0.3367
73
+ },
74
+ {
75
+ "name": "sts_tr",
76
+ "task": "text_classification",
77
+ "acc": 0.22987672226250908,
78
+ "acc_norm": 0.19434372733865118
79
+ },
80
+ {
81
+ "name": "tquad",
82
+ "task": "extractive_question_answering",
83
+ "exact_match": 0.2062780269058296,
84
+ "f1": 0.4653972244152745
85
+ },
86
+ {
87
+ "name": "turkish_plu_goal_inference",
88
+ "task": "multiple_choice",
89
+ "acc": 0.3918757467144564,
90
+ "acc_norm": 0.3859020310633214
91
+ },
92
+ {
93
+ "name": "turkish_plu_next_event_prediction",
94
+ "task": "multiple_choice",
95
+ "acc": 0.4687022900763359,
96
+ "acc_norm": 0.5374045801526718
97
+ },
98
+ {
99
+ "name": "turkish_plu_step_inference",
100
+ "task": "multiple_choice",
101
+ "acc": 0.33986928104575165,
102
+ "acc_norm": 0.45098039215686275
103
+ },
104
+ {
105
+ "name": "turkish_plu_step_ordering",
106
+ "task": "multiple_choice",
107
+ "acc": 0.6180215475024485,
108
+ "acc_norm": 0.6180215475024485
109
+ },
110
+ {
111
+ "name": "xcopa_tr",
112
+ "task": "multiple_choice",
113
+ "acc": 0.596,
114
+ "acc_norm": 0.596
115
+ },
116
+ {
117
+ "name": "xnli_tr",
118
+ "task": "natural_language_inference",
119
+ "acc": 0.4771084337349398,
120
+ "acc_norm": 0.4771084337349398
121
+ },
122
+ {
123
+ "name": "xquad_tr",
124
+ "task": "extractive_question_answering",
125
+ "exact_match": 0.24705882352941178,
126
+ "f1": 0.44192474929656556
127
+ },
128
+ {
129
+ "name": "gecturk_generation",
130
+ "task": "grammatical_error_correction",
131
+ "exact_match": 0.008281573498964804
132
+ },
133
+ {
134
+ "name": "mlsum_tr",
135
+ "task": "summarization",
136
+ "rouge1": 0.37037019926313125,
137
+ "rouge2": 0.24005923597941317,
138
+ "rougeL": 0.31098002776173184
139
+ },
140
+ {
141
+ "name": "wiki_lingua_tr",
142
+ "task": "summarization",
143
+ "rouge1": 0.2645070959726481,
144
+ "rouge2": 0.11354354716145479,
145
+ "rougeL": 0.21357621995467704
146
+ },
147
+ {
148
+ "name": "wmt-tr-en-prompt",
149
+ "task": "machine_translation",
150
+ "wer": 0.7464128097803795,
151
+ "bleu": 0.16878189334002527
152
+ },
153
+ {
154
+ "name": "xlsum_tr",
155
+ "task": "summarization",
156
+ "rouge1": 0.2855728817569547,
157
+ "rouge2": 0.14081555638864124,
158
+ "rougeL": 0.23467303626936886
159
+ }
160
+ ]
161
+ }
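
Each file under `results/zero-shot/` has this shape and is one entry of the list consumed by `cache_results` in `app.py`. The loader `utils.read_results` itself is not part of this commit, so its exact behavior is not shown; a plausible sketch, assuming it simply loads every JSON file in the given directory, would be:

```python
import glob
import json
import os.path as osp

def read_results(path):
    # Assumption: one formatted JSON per model, e.g. results/zero-shot/*.json.
    entries = []
    for fname in sorted(glob.glob(osp.join(path, '*.json'))):
        with open(fname) as f:
            entries.append(json.load(f))
    return entries
```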
results/zero-shot/aya-expanse-8b.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "CohereForAI/aya-expanse-8b",
4
+ "api": "hf",
5
+ "architecture": "CohereForCausalLM",
6
+ "max_length": 8192,
7
+ "dtype": "float16",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.7355555555555555,
16
+ "acc_norm": 0.7355555555555555
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3155216284987277,
22
+ "acc_norm": 0.3460559796437659
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.4026508226691042,
28
+ "acc_norm": 0.6224862888482633
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.0018296499590736194
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.505,
39
+ "acc_norm": 0.49833333333333335
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.06954720331459012,
45
+ "f1": 0.13476533908972033
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.363610486561065,
51
+ "rouge2": 0.21362825588593481,
52
+ "rougeL": 0.29773476508614094
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.3078,
58
+ "acc_norm": 0.35
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.76,
64
+ "acc_norm": 0.58
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.2675736961451247,
70
+ "acc_norm": 0.7956349206349206
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.5877513711151737,
76
+ "acc_norm": 0.579981718464351
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.344,
82
+ "acc_norm": 0.3435
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.2095721537345903,
88
+ "acc_norm": 0.21029731689630166
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.13452914798206278,
94
+ "f1": 0.435087842533856
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.4062126642771804,
100
+ "acc_norm": 0.3930704898446834
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.4900763358778626,
106
+ "acc_norm": 0.5465648854961832
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.3464052287581699,
112
+ "acc_norm": 0.4395424836601307
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5935357492654261,
118
+ "acc_norm": 0.5935357492654261
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.3064320242538614,
124
+ "rouge2": 0.1340385267540697,
125
+ "rougeL": 0.24764232131755232
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 0.7822550373875778,
131
+ "bleu": 0.17034711245148307
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.578,
137
+ "acc_norm": 0.578
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.26621653203927675,
143
+ "rouge2": 0.133428873146516,
144
+ "rougeL": 0.2083669711429916
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.4919678714859438,
150
+ "acc_norm": 0.4919678714859438
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.2495798319327731,
156
+ "f1": 0.4735125568867167
157
+ }
158
+ ]
159
+ }
results/zero-shot/aya101.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": 4096,
5
+ "model": "CohereForAI/aya-101",
6
+ "api": "hf",
7
+ "architecture": "T5ForConditionalGeneration",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "13b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.07563025210084033,
16
+ "f1": 0.16462359535888943
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.02416422194769531,
22
+ "rouge2": 0.00149839274458772,
23
+ "rougeL": 0.02416422194769531
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.596,
29
+ "acc_norm": 0.596
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 0.9853633715998092,
35
+ "bleu": 0.0
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.029006633700390562,
41
+ "rouge2": 0.0004998910319276452,
42
+ "rougeL": 0.028967197984657227
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.41344,
48
+ "acc_norm": 0.42816
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.3739545997610514,
54
+ "acc_norm": 0.33811230585424135
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.34961832061068704,
60
+ "acc_norm": 0.38625954198473283
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.272875816993464,
66
+ "acc_norm": 0.35784313725490197
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5710088148873653,
72
+ "acc_norm": 0.5710088148873653
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.553473491773309,
78
+ "acc_norm": 0.6238574040219378
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.6709323583180987,
84
+ "acc_norm": 0.5781535648994516
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.02053796966151103,
90
+ "rouge2": 0.00029270301029826366,
91
+ "rougeL": 0.020495031370814234
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.053811659192825115,
97
+ "f1": 0.09199690627084456
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.1696881798404641,
103
+ "acc_norm": 0.18781725888324874
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.7993197278911565,
109
+ "acc_norm": 0.7970521541950113
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.279,
115
+ "acc_norm": 0.3386
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.2558,
121
+ "acc_norm": 0.3279
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.2998003992015968,
127
+ "acc_norm": 0.34291417165668664
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.2,
133
+ "acc_norm": 0.2
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.021746360547255133,
139
+ "rouge2": 0.003113110667892852,
140
+ "rougeL": 0.021727065059735186
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.025451316957679788,
146
+ "f1": 0.05324060372891391
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.5216666666666666,
152
+ "acc_norm": 0.5
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.0
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.22900763358778625,
163
+ "acc_norm": 0.2366412213740458
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.2288888888888889,
169
+ "acc_norm": 0.2288888888888889
170
+ }
171
+ ]
172
+ }
results/zero-shot/commencis-7b.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "Commencis/Commencis-LLM",
6
+ "api": "hf",
7
+ "architecture": "MistralForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.06638655462184874,
16
+ "f1": 0.22895337255761397
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.23661435034483103,
22
+ "rouge2": 0.09475637339836376,
23
+ "rougeL": 0.17114647899378693
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.58,
29
+ "acc_norm": 0.58
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 1.292660190832963,
35
+ "bleu": 0.046829706960566486
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.20899244459581318,
41
+ "rouge2": 0.06262304805792501,
42
+ "rougeL": 0.15190187433999106
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.4128,
48
+ "acc_norm": 0.46176
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.34767025089605735,
54
+ "acc_norm": 0.38948626045400236
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.38625954198473283,
60
+ "acc_norm": 0.46259541984732827
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.2761437908496732,
66
+ "acc_norm": 0.3872549019607843
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.56513222331048,
72
+ "acc_norm": 0.56513222331048
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.3903107861060329,
78
+ "acc_norm": 0.4835466179159049
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.5077696526508226,
84
+ "acc_norm": 0.526508226691042
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.23101542478965895,
90
+ "rouge2": 0.0718775262261334,
91
+ "rougeL": 0.16318786708633073
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.053811659192825115,
97
+ "f1": 0.3110458108565287
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.14865844815083393,
103
+ "acc_norm": 0.2226250906453952
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.24263038548752835,
109
+ "acc_norm": 0.29365079365079366
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3058,
115
+ "acc_norm": 0.3103
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.2972,
121
+ "acc_norm": 0.32
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.3141716566866267,
127
+ "acc_norm": 0.3281437125748503
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.624,
133
+ "acc_norm": 0.368
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.30963778437323686,
139
+ "rouge2": 0.16100694114326877,
140
+ "rougeL": 0.23447680384800107
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.0324060372891388,
146
+ "f1": 0.07231572678508513
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.56,
152
+ "acc_norm": 0.54
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.1701574461938466
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.24681933842239187,
163
+ "acc_norm": 0.29770992366412213
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.3233333333333333,
169
+ "acc_norm": 0.3233333333333333
170
+ }
171
+ ]
172
+ }
results/zero-shot/kanarya-2b.json ADDED
@@ -0,0 +1,171 @@
1
+ {
2
+ "model": {
3
+ "dtype": "float16",
4
+ "model": "asafaya/kanarya-2b",
5
+ "api": "hf",
6
+ "architecture": "GPTJForCausalLM",
7
+ "type": "pretrained",
8
+ "num_parameters": "3b"
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "belebele_tr",
13
+ "task": "multiple_choice",
14
+ "acc": 0.2811111111111111,
15
+ "acc_norm": 0.2811111111111111
16
+ },
17
+ {
18
+ "name": "exams_tr",
19
+ "task": "multiple_choice",
20
+ "acc": 0.30025445292620867,
21
+ "acc_norm": 0.3256997455470738
22
+ },
23
+ {
24
+ "name": "gecturk_generation",
25
+ "task": "grammatical_error_correction",
26
+ "exact_match": 9.62973662670326e-05
27
+ },
28
+ {
29
+ "name": "ironytr",
30
+ "task": "text_classification",
31
+ "acc": 0.5,
32
+ "acc_norm": 0.5016666666666667
33
+ },
34
+ {
35
+ "name": "mkqa_tr",
36
+ "task": "extractive_question_answering",
37
+ "exact_match": 0.005770938147380882,
38
+ "f1": 0.0157485308417537
39
+ },
40
+ {
41
+ "name": "mlsum_tr",
42
+ "task": "summarization",
43
+ "rouge1": 0.380182975983147,
44
+ "rouge2": 0.2469518162622865,
45
+ "rougeL": 0.30607429328228153
46
+ },
47
+ {
48
+ "name": "news_cat",
49
+ "task": "text_classification",
50
+ "acc": 0.668,
51
+ "acc_norm": 0.556
52
+ },
53
+ {
54
+ "name": "mnli_tr",
55
+ "task": "natural_language_inference",
56
+ "acc": 0.3278,
57
+ "acc_norm": 0.3463
58
+ },
59
+ {
60
+ "name": "snli_tr",
61
+ "task": "natural_language_inference",
62
+ "acc": 0.3088,
63
+ "acc_norm": 0.3109
64
+ },
65
+ {
66
+ "name": "xnli_tr",
67
+ "task": "natural_language_inference",
68
+ "acc": 0.3273453093812375,
69
+ "acc_norm": 0.3341317365269461
70
+ },
71
+ {
72
+ "name": "offenseval_tr",
73
+ "task": "text_classification",
74
+ "acc": 0.6159297052154195,
75
+ "acc_norm": 0.796485260770975
76
+ },
77
+ {
78
+ "name": "sts_tr",
79
+ "task": "text_classification",
80
+ "acc": 0.12907904278462654,
81
+ "acc_norm": 0.12037708484408992
82
+ },
83
+ {
84
+ "name": "tquad",
85
+ "task": "extractive_question_answering",
86
+ "exact_match": 0.016816143497757848,
87
+ "f1": 0.046325790025566756
88
+ },
89
+ {
90
+ "name": "check_worthiness",
91
+ "task": "multiple_choice",
92
+ "acc": 0.623400365630713,
93
+ "acc_norm": 0.6238574040219378
94
+ },
95
+ {
96
+ "name": "relevance_judgment",
97
+ "task": "multiple_choice",
98
+ "acc": 0.5068555758683729,
99
+ "acc_norm": 0.5758683729433273
100
+ },
101
+ {
102
+ "name": "turkish_plu",
103
+ "task": "multiple_choice",
104
+ "acc": 0.4928,
105
+ "acc_norm": 0.536
106
+ },
107
+ {
108
+ "name": "turkish_plu_goal_inference",
109
+ "task": "multiple_choice",
110
+ "acc": 0.45878136200716846,
111
+ "acc_norm": 0.46714456391875747
112
+ },
113
+ {
114
+ "name": "turkish_plu_next_event_prediction",
115
+ "task": "multiple_choice",
116
+ "acc": 0.45648854961832064,
117
+ "acc_norm": 0.5190839694656488
118
+ },
119
+ {
120
+ "name": "turkish_plu_step_inference",
121
+ "task": "multiple_choice",
122
+ "acc": 0.35784313725490197,
123
+ "acc_norm": 0.5
124
+ },
125
+ {
126
+ "name": "turkish_plu_step_ordering",
127
+ "task": "multiple_choice",
128
+ "acc": 0.6248775710088149,
129
+ "acc_norm": 0.6248775710088149
130
+ },
131
+ {
132
+ "name": "wiki_lingua_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.14941800836498376,
135
+ "rouge2": 0.04469826846423095,
136
+ "rougeL": 0.11118162846926655
137
+ },
138
+ {
139
+ "name": "wmt-tr-en-prompt",
140
+ "task": "machine_translation",
141
+ "wer": 2.833755212322392,
142
+ "bleu": 0.030496946295093332
143
+ },
144
+ {
145
+ "name": "xcopa_tr",
146
+ "task": "multiple_choice",
147
+ "acc": 0.642,
148
+ "acc_norm": 0.642
149
+ },
150
+ {
151
+ "name": "xlsum_tr",
152
+ "task": "summarization",
153
+ "rouge1": 0.2462743722502333,
154
+ "rouge2": 0.09312295140534987,
155
+ "rougeL": 0.1685445897911506
156
+ },
157
+ {
158
+ "name": "tr-wikihow-summ",
159
+ "task": "summarization",
160
+ "rouge1": null,
161
+ "rouge2": null,
162
+ "rougeL": null
163
+ },
164
+ {
165
+ "name": "xquad_tr",
166
+ "task": "extractive_question_answering",
167
+ "exact_match": 0.008403361344537815,
168
+ "f1": 0.027799180278171867
169
+ }
170
+ ]
171
+ }
results/zero-shot/llama-3-8b-instruct.json ADDED
@@ -0,0 +1,160 @@
1
+ {
2
+ "model": {
3
+ "trust_remote_code": "True",
4
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
5
+ "api": "hf",
6
+ "architecture": "LlamaForCausalLM",
7
+ "max_length": 8192,
8
+ "type": "instruction-tuned",
9
+ "dtype": "bfloat16",
10
+ "num_parameters": "8b"
11
+ },
12
+ "results": [
13
+ {
14
+ "name": "belebele_tr",
15
+ "task": "multiple_choice",
16
+ "acc": 0.6633333333333333,
17
+ "acc_norm": 0.6633333333333333
18
+ },
19
+ {
20
+ "name": "exams_tr",
21
+ "task": "multiple_choice",
22
+ "acc": 0.2697201017811705,
23
+ "acc_norm": 0.3104325699745547
24
+ },
25
+ {
26
+ "name": "check_worthiness",
27
+ "task": "multiple_choice",
28
+ "acc": 0.4218464351005484,
29
+ "acc_norm": 0.5644424131627057
30
+ },
31
+ {
32
+ "name": "ironytr",
33
+ "task": "text_classification",
34
+ "acc": 0.545,
35
+ "acc_norm": 0.6466666666666666
36
+ },
37
+ {
38
+ "name": "mkqa_tr",
39
+ "task": "extractive_question_answering",
40
+ "exact_match": 0.0424681858538029,
41
+ "f1": 0.11050423163975964
42
+ },
43
+ {
44
+ "name": "mnli_tr",
45
+ "task": "natural_language_inference",
46
+ "acc": 0.3201,
47
+ "acc_norm": 0.3653
48
+ },
49
+ {
50
+ "name": "news_cat",
51
+ "task": "text_classification",
52
+ "acc": 0.628,
53
+ "acc_norm": 0.588
54
+ },
55
+ {
56
+ "name": "offenseval_tr",
57
+ "task": "text_classification",
58
+ "acc": 0.3081065759637188,
59
+ "acc_norm": 0.7304421768707483
60
+ },
61
+ {
62
+ "name": "relevance_judgment",
63
+ "task": "multiple_choice",
64
+ "acc": 0.603290676416819,
65
+ "acc_norm": 0.5790676416819013
66
+ },
67
+ {
68
+ "name": "snli_tr",
69
+ "task": "natural_language_inference",
70
+ "acc": 0.3283,
71
+ "acc_norm": 0.353
72
+ },
73
+ {
74
+ "name": "sts_tr",
75
+ "task": "text_classification",
76
+ "acc": 0.14213197969543148,
77
+ "acc_norm": 0.21537345902828137
78
+ },
79
+ {
80
+ "name": "tquad",
81
+ "task": "extractive_question_answering",
82
+ "exact_match": 0.1289237668161435,
83
+ "f1": 0.4134057883004977
84
+ },
85
+ {
86
+ "name": "turkish_plu_goal_inference",
87
+ "task": "multiple_choice",
88
+ "acc": 0.38829151732377537,
89
+ "acc_norm": 0.43130227001194743
90
+ },
91
+ {
92
+ "name": "turkish_plu_next_event_prediction",
93
+ "task": "multiple_choice",
94
+ "acc": 0.4549618320610687,
95
+ "acc_norm": 0.517557251908397
96
+ },
97
+ {
98
+ "name": "turkish_plu_step_inference",
99
+ "task": "multiple_choice",
100
+ "acc": 0.3137254901960784,
101
+ "acc_norm": 0.44281045751633985
102
+ },
103
+ {
104
+ "name": "turkish_plu_step_ordering",
105
+ "task": "multiple_choice",
106
+ "acc": 0.6160626836434868,
107
+ "acc_norm": 0.6160626836434868
108
+ },
109
+ {
110
+ "name": "xcopa_tr",
111
+ "task": "multiple_choice",
112
+ "acc": 0.586,
113
+ "acc_norm": 0.586
114
+ },
115
+ {
116
+ "name": "xnli_tr",
117
+ "task": "natural_language_inference",
118
+ "acc": 0.4389558232931727,
119
+ "acc_norm": 0.4389558232931727
120
+ },
121
+ {
122
+ "name": "xquad_tr",
123
+ "task": "extractive_question_answering",
124
+ "exact_match": 0.09747899159663864,
125
+ "f1": 0.24450355256139333
126
+ },
127
+ {
128
+ "name": "gecturk_generation",
129
+ "task": "grammatical_error_correction",
130
+ "exact_match": 0.005007463045885695
131
+ },
132
+ {
133
+ "name": "mlsum_tr",
134
+ "task": "summarization",
135
+ "rouge1": 0.40612528796779146,
136
+ "rouge2": 0.25769550481564407,
137
+ "rougeL": 0.3281187592669974
138
+ },
139
+ {
140
+ "name": "wiki_lingua_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.23621778991663983,
143
+ "rouge2": 0.08052321922363763,
144
+ "rougeL": 0.1710165526266978
145
+ },
146
+ {
147
+ "name": "wmt-tr-en-prompt",
148
+ "task": "machine_translation",
149
+ "wer": 0.823814082821166,
150
+ "bleu": 0.13572050882587958
151
+ },
152
+ {
153
+ "name": "xlsum_tr",
154
+ "task": "summarization",
155
+ "rouge1": 0.29619456321037296,
156
+ "rouge2": 0.13520487191226377,
157
+ "rougeL": 0.220446635816053
158
+ }
159
+ ]
160
+ }
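Each results file added in this commit follows the same layout: a "model" block with loading metadata (checkpoint name, API, dtype, context length, parameter count) and a "results" list holding per-dataset metrics. As a minimal, illustrative sketch only (the file path and field names are taken from the JSON above; nothing here is part of the commit), one report can be consumed like this:

    import json

    # Load a single zero-shot report and average its accuracy-style scores.
    with open('results/zero-shot/llama-3-8b-instruct.json') as f:
        report = json.load(f)

    accs = [r['acc'] for r in report['results'] if 'acc' in r]
    print(report['model']['model'], sum(accs) / len(accs))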
results/zero-shot/llama-3-8b.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3-8B",
4
+ "api": "hf",
5
+ "architecture": "LlamaForCausalLM",
6
+ "max_length": 8192,
7
+ "type": "pretrained",
8
+ "dtype": "bfloat16",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.5144,
16
+ "acc_norm": 0.5144
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3028,
22
+ "acc_norm": 0.3537
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.38391224862888484
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.515,
34
+ "acc_norm": 0.525
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.13465522343888725,
40
+ "f1": 0.19144550324599957
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3206,
46
+ "acc_norm": 0.3329
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.724,
52
+ "acc_norm": 0.656
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.2193877551020408,
58
+ "acc_norm": 0.48214285714285715
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.42550274223034734,
64
+ "acc_norm": 0.5173674588665448
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.325,
70
+ "acc_norm": 0.3766
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.16388687454677303,
76
+ "acc_norm": 0.19216823785351705
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.28475336322869954,
82
+ "f1": 0.5013148868557868
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.38948626045400236,
88
+ "acc_norm": 0.4169653524492234
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4488549618320611,
94
+ "acc_norm": 0.5328244274809161
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.32189542483660133,
100
+ "acc_norm": 0.47058823529411764
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.6278158667972575,
106
+ "acc_norm": 0.6278158667972575
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.618,
112
+ "acc_norm": 0.618
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4839357429718876,
118
+ "acc_norm": 0.4839357429718876
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.20840336134453782,
124
+ "f1": 0.33796418555415153
125
+ },
126
+ {
127
+ "name": "gecturk_generation",
128
+ "task": "grammatical_error_correction",
129
+ "exact_match": 0.006692666955558766
130
+ },
131
+ {
132
+ "name": "mlsum_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.38446881575055203,
135
+ "rouge2": 0.2503978598237102,
136
+ "rougeL": 0.319713589198042
137
+ },
138
+ {
139
+ "name": "wiki_lingua_tr",
140
+ "task": "summarization",
141
+ "rouge1": 0.2069234464456151,
142
+ "rouge2": 0.06576422586110373,
143
+ "rougeL": 0.1516869929958613
144
+ },
145
+ {
146
+ "name": "wmt-tr-en-prompt",
147
+ "task": "machine_translation",
148
+ "wer": 0.9262281724087097,
149
+ "bleu": 0.113320746345327
150
+ },
151
+ {
152
+ "name": "xlsum_tr",
153
+ "task": "summarization",
154
+ "rouge1": 0.2615001361521869,
155
+ "rouge2": 0.11093149007661907,
156
+ "rougeL": 0.20321693263972507
157
+ }
158
+ ]
159
+ }
results/zero-shot/llama-3.1-8b-instruct.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.7077777777777777,
16
+ "acc_norm": 0.7077777777777777
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3231552162849873,
22
+ "acc_norm": 0.35877862595419846
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.37614259597806216
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.5133333333333333,
34
+ "acc_norm": 0.5666666666666667
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.09115122817401598,
40
+ "f1": 0.15627870028803578
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3209,
46
+ "acc_norm": 0.3596
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.66,
52
+ "acc_norm": 0.604
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.23582766439909297,
58
+ "acc_norm": 0.3687641723356009
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4648080438756856,
64
+ "acc_norm": 0.5648994515539305
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.3028,
70
+ "acc_norm": 0.3528
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.19579405366207397,
76
+ "acc_norm": 0.1551849166062364
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.23318385650224216,
82
+ "f1": 0.5062272078338648
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.40860215053763443,
88
+ "acc_norm": 0.45997610513739545
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4442748091603053,
94
+ "acc_norm": 0.5419847328244275
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.33169934640522875,
100
+ "acc_norm": 0.4624183006535948
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.633692458374143,
106
+ "acc_norm": 0.633692458374143
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.608,
112
+ "acc_norm": 0.608
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4807228915662651,
118
+ "acc_norm": 0.4807228915662651
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.21428571428571427,
124
+ "f1": 0.4170277103753468
125
+ },
126
+ {
127
+ "name": "gecturk_generation",
128
+ "task": "grammatical_error_correction",
129
+ "exact_match": 0.005007463045885695
130
+ },
131
+ {
132
+ "name": "mlsum_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.40612528796779146,
135
+ "rouge2": 0.25769550481564407,
136
+ "rougeL": 0.3281187592669974
137
+ },
138
+ {
139
+ "name": "wiki_lingua_tr",
140
+ "task": "summarization",
141
+ "rouge1": 0.23621778991663983,
142
+ "rouge2": 0.08052321922363763,
143
+ "rougeL": 0.1710165526266978
144
+ },
145
+ {
146
+ "name": "wmt-tr-en-prompt",
147
+ "task": "machine_translation",
148
+ "wer": 0.823814082821166,
149
+ "bleu": 0.13572050882587958
150
+ },
151
+ {
152
+ "name": "xlsum_tr",
153
+ "task": "summarization",
154
+ "rouge1": 0.29619456321037296,
155
+ "rouge2": 0.13520487191226377,
156
+ "rougeL": 0.220446635816053
157
+ }
158
+ ]
159
+ }
results/zero-shot/llama-3.1-8b.json ADDED
@@ -0,0 +1,127 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3.1-8B",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.6144,
16
+ "acc_norm": 0.6144
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3130,
22
+ "acc_norm": 0.3537
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.37751371115173676
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.585,
34
+ "acc_norm": 0.5183333333333333
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.09248298313110388,
40
+ "f1": 0.15127108197296948
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3495,
46
+ "acc_norm": 0.3481
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.692,
52
+ "acc_norm": 0.588
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.3463718820861678,
58
+ "acc_norm": 0.7636054421768708
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4227605118829982,
64
+ "acc_norm": 0.506398537477148
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.3169,
70
+ "acc_norm": 0.3379
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.17041334300217548,
76
+ "acc_norm": 0.2001450326323423
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.2757847533632287,
82
+ "f1": 0.5178366277473359
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.4145758661887694,
88
+ "acc_norm": 0.4324970131421744
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4488549618320611,
94
+ "acc_norm": 0.5358778625954198
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.3382352941176471,
100
+ "acc_norm": 0.4738562091503268
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.6425073457394711,
106
+ "acc_norm": 0.6425073457394711
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.626,
112
+ "acc_norm": 0.626
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4947791164658635,
118
+ "acc_norm": 0.4947791164658635
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.2092436974789916,
124
+ "f1": 0.35674599908781446
125
+ }
126
+ ]
127
+ }
results/zero-shot/llama-3.2-1b.json ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Llama-3.2-1B",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "1b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.29555555555555557,
16
+ "acc_norm": 0.29555555555555557
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.28498727735368956,
22
+ "acc_norm": 0.3053435114503817
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.3880255941499086,
28
+ "acc_norm": 0.623400365630713
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.00741489720256151
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.5283333333333333,
39
+ "acc_norm": 0.5033333333333333
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.007694584196507843,
45
+ "f1": 0.03304091036050505
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.23283491254211872,
51
+ "rouge2": 0.13426790568610214,
52
+ "rougeL": 0.18915548037371513
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.3232,
58
+ "acc_norm": 0.334
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.58,
64
+ "acc_norm": 0.532
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.4671201814058957,
70
+ "acc_norm": 0.7820294784580499
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.56672760511883,
76
+ "acc_norm": 0.5781535648994516
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.3239,
82
+ "acc_norm": 0.3105
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.17113850616388687,
88
+ "acc_norm": 0.22552574329224076
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.06278026905829596,
94
+ "f1": 0.21486130318406463
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.35842293906810035,
100
+ "acc_norm": 0.4026284348864994
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.3709923664122137,
106
+ "acc_norm": 0.467175572519084
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.27941176470588236,
112
+ "acc_norm": 0.41830065359477125
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5759059745347699,
118
+ "acc_norm": 0.5759059745347699
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.10861529436199803,
124
+ "rouge2": 0.034862923521078545,
125
+ "rougeL": 0.08692160533533941
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 3.910683208136067,
131
+ "bleu": 0.012043288243775466
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.556,
137
+ "acc_norm": 0.556
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.16924699150407269,
143
+ "rouge2": 0.07190935921365724,
144
+ "rougeL": 0.13255123335488528
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.4389558232931727,
150
+ "acc_norm": 0.4389558232931727
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.04873949579831932,
156
+ "f1": 0.11156636293859905
157
+ },
158
+ {
159
+ "name": "gecturk_generation",
160
+ "task": "grammatical_error_correction",
161
+ "exact_match": 0.0073185998362944775
162
+ },
163
+ {
164
+ "name": "mlsum_tr",
165
+ "task": "summarization",
166
+ "rouge1": 0.35440052022111407,
167
+ "rouge2": 0.2215476501673455,
168
+ "rougeL": 0.2911311598176804
169
+ },
170
+ {
171
+ "name": "wiki_lingua_tr",
172
+ "task": "summarization",
173
+ "rouge1": 0.18510384577665046,
174
+ "rouge2": 0.056181066004903614,
175
+ "rougeL": 0.1392211003290612
176
+ },
177
+ {
178
+ "name": "wmt-tr-en-prompt",
179
+ "task": "machine_translation",
180
+ "wer": 1.311990023748812,
181
+ "bleu": 0.02624044942774961
182
+ },
183
+ {
184
+ "name": "xlsum_tr",
185
+ "task": "summarization",
186
+ "rouge1": 0.2429304790539497,
187
+ "rouge2": 0.09668008744707143,
188
+ "rougeL": 0.18327092913535944
189
+ }
190
+ ]
191
+ }
results/zero-shot/llama-3.2-3b-instruct.json ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "3b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.5577777777777778,
16
+ "acc_norm": 0.5577777777777778
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.26208651399491095,
22
+ "acc_norm": 0.3053435114503817
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.3807129798903108
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.007222302470027445
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.5016666666666667,
39
+ "acc_norm": 0.5083333333333333
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.04675939627108612,
45
+ "f1": 0.08114473798410345
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.2669056212126977,
51
+ "rouge2": 0.1480446780314802,
52
+ "rougeL": 0.2106440565987865
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.32,
58
+ "acc_norm": 0.3141
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.64,
64
+ "acc_norm": 0.552
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.20634920634920634,
70
+ "acc_norm": 0.35600907029478457
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.4227605118829982,
76
+ "acc_norm": 0.42413162705667273
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.319,
82
+ "acc_norm": 0.2923
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.12907904278462654,
88
+ "acc_norm": 0.16896301667875271
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.18721973094170405,
94
+ "f1": 0.5109898180473623
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.3321385902031063,
100
+ "acc_norm": 0.3548387096774194
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.3648854961832061,
106
+ "acc_norm": 0.4488549618320611
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.24183006535947713,
112
+ "acc_norm": 0.3758169934640523
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5710088148873653,
118
+ "acc_norm": 0.5710088148873653
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.1342879173103036,
124
+ "rouge2": 0.041489300068460175,
125
+ "rougeL": 0.10482785510181569
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 1.7706536060519733,
131
+ "bleu": 0.048843165627950165
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.546,
137
+ "acc_norm": 0.546
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.17224405229987672,
143
+ "rouge2": 0.06736413357191079,
144
+ "rougeL": 0.12750762702828333
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.42811244979919677,
150
+ "acc_norm": 0.42811244979919677
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.23025210084033615,
156
+ "f1": 0.4335914561273987
157
+ },
158
+ {
159
+ "name": "gecturk_generation",
160
+ "task": "grammatical_error_correction",
161
+ "exact_match": 0.009726033992970293
162
+ },
163
+ {
164
+ "name": "mlsum_tr",
165
+ "task": "summarization",
166
+ "rouge1": 0.36482642805140486,
167
+ "rouge2": 0.2215366481025873,
168
+ "rougeL": 0.2964001074060548
169
+ },
170
+ {
171
+ "name": "wiki_lingua_tr",
172
+ "task": "summarization",
173
+ "rouge1": 0.21420020104688736,
174
+ "rouge2": 0.06939715371402275,
175
+ "rougeL": 0.1623531918550368
176
+ },
177
+ {
178
+ "name": "wmt-tr-en-prompt",
179
+ "task": "machine_translation",
180
+ "wer": 0.9910280580654681,
181
+ "bleu": 0.08179536823012563
182
+ },
183
+ {
184
+ "name": "xlsum_tr",
185
+ "task": "summarization",
186
+ "rouge1": 0.2616423061938248,
187
+ "rouge2": 0.11064039063859936,
188
+ "rougeL": 0.19686955120787036
189
+ }
190
+ ]
191
+ }
results/zero-shot/mistral-7b.json ADDED
@@ -0,0 +1,165 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "mistralai/Mistral-7B-v0.1",
6
+ "api": "hf",
7
+ "architecture": "MixtralForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.16722689075630254,
16
+ "f1": 0.32150094374615246
17
+ },
18
+ {
19
+ "name": "xcopa_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.566,
22
+ "acc_norm": 0.566
23
+ },
24
+ {
25
+ "name": "turkish_plu",
26
+ "task": "multiple_choice",
27
+ "acc": 0.45152,
28
+ "acc_norm": 0.5136
29
+ },
30
+ {
31
+ "name": "turkish_plu_goal_inference",
32
+ "task": "multiple_choice",
33
+ "acc": 0.42771804062126645,
34
+ "acc_norm": 0.46714456391875747
35
+ },
36
+ {
37
+ "name": "turkish_plu_next_event_prediction",
38
+ "task": "multiple_choice",
39
+ "acc": 0.39541984732824426,
40
+ "acc_norm": 0.5022900763358779
41
+ },
42
+ {
43
+ "name": "turkish_plu_step_inference",
44
+ "task": "multiple_choice",
45
+ "acc": 0.29248366013071897,
46
+ "acc_norm": 0.4411764705882353
47
+ },
48
+ {
49
+ "name": "turkish_plu_step_ordering",
50
+ "task": "multiple_choice",
51
+ "acc": 0.6023506366307542,
52
+ "acc_norm": 0.6023506366307542
53
+ },
54
+ {
55
+ "name": "check_worthiness",
56
+ "task": "multiple_choice",
57
+ "acc": 0.37614259597806216,
58
+ "acc_norm": 0.42458866544789764
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4218464351005484,
64
+ "acc_norm": 0.49588665447897623
65
+ },
66
+ {
67
+ "name": "tquad",
68
+ "task": "extractive_question_answering",
69
+ "exact_match": 0.2096412556053812,
70
+ "f1": 0.4767364701184728
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.135605511240029,
76
+ "acc_norm": 0.20522117476432197
77
+ },
78
+ {
79
+ "name": "offenseval_tr",
80
+ "task": "text_classification",
81
+ "acc": 0.2046485260770975,
82
+ "acc_norm": 0.3735827664399093
83
+ },
84
+ {
85
+ "name": "mnli_tr",
86
+ "task": "natural_language_inference",
87
+ "acc": 0.3194,
88
+ "acc_norm": 0.3267
89
+ },
90
+ {
91
+ "name": "snli_tr",
92
+ "task": "natural_language_inference",
93
+ "acc": 0.3196,
94
+ "acc_norm": 0.3201
95
+ },
96
+ {
97
+ "name": "xnli_tr",
98
+ "task": "natural_language_inference",
99
+ "acc": 0.331936127744511,
100
+ "acc_norm": 0.34910179640718564
101
+ },
102
+ {
103
+ "name": "news_cat",
104
+ "task": "text_classification",
105
+ "acc": 0.652,
106
+ "acc_norm": 0.44
107
+ },
108
+ {
109
+ "name": "mkqa_tr",
110
+ "task": "extractive_question_answering",
111
+ "exact_match": 0.12030186445693992,
112
+ "f1": 0.16163416207615164
113
+ },
114
+ {
115
+ "name": "ironytr",
116
+ "task": "text_classification",
117
+ "acc": 0.5016666666666667,
118
+ "acc_norm": 0.52
119
+ },
120
+ {
121
+ "name": "exams_tr",
122
+ "task": "multiple_choice",
123
+ "acc": 0.24173027989821882,
124
+ "acc_norm": 0.30279898218829515
125
+ },
126
+ {
127
+ "name": "belebele_tr",
128
+ "task": "multiple_choice",
129
+ "acc": 0.37444444444444447,
130
+ "acc_norm": 0.37444444444444447
131
+ },
132
+ {
133
+ "name": "gecturk_generation",
134
+ "task": "grammatical_error_correction",
135
+ "exact_match": 0.20660599932591844
136
+ },
137
+ {
138
+ "name": "mlsum_tr",
139
+ "task": "summarization",
140
+ "rouge1": 0.09403885616158554,
141
+ "rouge2": 0.06300721907752257,
142
+ "rougeL": 0.08169726458665999
143
+ },
144
+ {
145
+ "name": "wiki_lingua_tr",
146
+ "task": "summarization",
147
+ "rouge1": 0.1905392717787084,
148
+ "rouge2": 0.05957088325130176,
149
+ "rougeL": 0.1472985242082243
150
+ },
151
+ {
152
+ "name": "wmt-tr-en-prompt",
153
+ "task": "machine_translation",
154
+ "wer": 1.0876062644712858,
155
+ "bleu": 0.04973628734419603
156
+ },
157
+ {
158
+ "name": "xlsum_tr",
159
+ "task": "summarization",
160
+ "rouge1": 0.02720399421152351,
161
+ "rouge2": 0.012032606076011431,
162
+ "rougeL": 0.02311080687545987
163
+ }
164
+ ]
165
+ }
results/zero-shot/trendyol-7b.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "Trendyol/Trendyol-LLM-7b-base-v1.0",
6
+ "api": "hf",
7
+ "architecture": "MixtralForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.0,
16
+ "f1": 0.15289561928390746
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.12128827095936726,
22
+ "rouge2": 0.05041801264157676,
23
+ "rougeL": 0.09604301857137748
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.61,
29
+ "acc_norm": 0.61
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 13.038665635458035,
35
+ "bleu": 0.010261135899096054
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.09429776166714862,
41
+ "rouge2": 0.02873358785517343,
42
+ "rougeL": 0.07767336257524773
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.46944,
48
+ "acc_norm": 0.49952
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.4635603345280765,
54
+ "acc_norm": 0.44683393070489846
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.43206106870229005,
60
+ "acc_norm": 0.48854961832061067
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.3235294117647059,
66
+ "acc_norm": 0.4395424836601307
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5857002938295789,
72
+ "acc_norm": 0.5857002938295789
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.37614259597806216,
78
+ "acc_norm": 0.37614259597806216
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.4218464351005484,
84
+ "acc_norm": 0.4218464351005484
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.1602888221320987,
90
+ "rouge2": 0.04616347811027626,
91
+ "rougeL": 0.12482407983918105
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.007847533632286996,
97
+ "f1": 0.26089513093937805
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.1551849166062364,
103
+ "acc_norm": 0.22697606961566352
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.20294784580498867,
109
+ "acc_norm": 0.20294784580498867
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3134,
115
+ "acc_norm": 0.2942
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.3204,
121
+ "acc_norm": 0.2894
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.32974051896207585,
127
+ "acc_norm": 0.300998003992016
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.812,
133
+ "acc_norm": 0.628
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.15450187559493767,
139
+ "rouge2": 0.08797823051939649,
140
+ "rougeL": 0.1350441813405041
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.001479727730097662,
146
+ "f1": 0.037161672000373895
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.5,
152
+ "acc_norm": 0.5
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.00048148683133516297
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.28498727735368956,
163
+ "acc_norm": 0.3486005089058524
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.3622222222222222,
169
+ "acc_norm": 0.3622222222222222
170
+ }
171
+ ]
172
+ }
results/zero-shot/turna.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "auto",
4
+ "max_length": "1024",
5
+ "model": "boun-tabi-LMG/TURNA",
6
+ "api": "hf",
7
+ "architecture": "T5ForCondtiionalGeneration",
8
+ "type": "pretrained",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.0,
16
+ "f1": 0.0
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.1904384366601188,
22
+ "rouge2": 0.060686113611140166,
23
+ "rougeL": 0.1311090280660866
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.558,
29
+ "acc_norm": 0.558
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 3.9036796738046218,
35
+ "bleu": 0.0008286617236874524
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.18435291474691423,
41
+ "rouge2": 0.05584649726914134,
42
+ "rougeL": 0.13446021077350823
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.40288,
48
+ "acc_norm": 0.44608
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.37992831541218636,
54
+ "acc_norm": 0.35722819593787336
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.383206106870229,
60
+ "acc_norm": 0.4488549618320611
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.272875816993464,
66
+ "acc_norm": 0.4542483660130719
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5122428991185113,
72
+ "acc_norm": 0.5122428991185113
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.42230347349177333,
78
+ "acc_norm": 0.620201096892139
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.4904021937842779,
84
+ "acc_norm": 0.5781535648994516
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.20515501424269858,
90
+ "rouge2": 0.05693981251975118,
91
+ "rougeL": 0.1449313333992171
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.0,
97
+ "f1": 0.0003736920777279522
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.14213197969543148,
103
+ "acc_norm": 0.19506889050036258
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.5099206349206349,
109
+ "acc_norm": 0.7970521541950113
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3203,
115
+ "acc_norm": 0.3159
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.3223,
121
+ "acc_norm": 0.3278
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.32974051896207585,
127
+ "acc_norm": 0.3277445109780439
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.328,
133
+ "acc_norm": 0.208
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.20830277213555015,
139
+ "rouge2": 0.11040542892341527,
140
+ "rougeL": 0.16135585618616377
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.0011837821840781297,
146
+ "f1": 0.006720430107526878
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.48333333333333334,
152
+ "acc_norm": 0.5033333333333333
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.0
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.2366412213740458,
163
+ "acc_norm": 0.2748091603053435
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.22555555555555556,
169
+ "acc_norm": 0.22555555555555556
170
+ }
171
+ ]
172
+ }
utils.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import os.path as osp
+ import json
+
+
+ def preprocess_path(path):
+     # Expand "~" and resolve the path to an absolute path.
+     path = osp.expanduser(path)
+     path = osp.abspath(path)
+     return path
+
+
+ def get_model_url(entry):
+     # Models evaluated through the Hugging Face API link to their hub page;
+     # other entries fall back to an explicit "url" field when available.
+     if entry['api'] == 'hf':
+         return f'https://huggingface.co/{entry["model"]}'
+     return entry.get('url', f'https://localhost/{entry["model"]}')
+
+
+ def read_results(path):
+     # Load every JSON results file in the given directory, sorted by file name.
+     path = preprocess_path(path)
+     file_list = sorted(os.listdir(path))
+     results = list()
+     for file_name in file_list:
+         file_path = osp.join(path, file_name)
+         with open(file_path, 'r') as f:
+             this = json.load(f)
+         results.append(this)
+     return results
+
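For reference, a minimal usage sketch (not part of the commit) showing how the helpers above could load the zero-shot reports added in this commit; the 'results/zero-shot' directory comes from the file paths above, and the metric printed is an arbitrary choice:

    from utils import read_results, get_model_url

    # Read every JSON report in the directory and print one headline score per model.
    entries = read_results('results/zero-shot')
    for entry in entries:
        meta = entry['model']
        scores = {r['name']: r for r in entry['results']}
        belebele_acc = scores.get('belebele_tr', {}).get('acc')
        print(meta['model'], get_model_url(meta), belebele_acc)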