Ilker Kesen committed on
Commit 500fbd7 · 1 Parent(s): 74daf31

initialize the first version

.gitignore ADDED
@@ -0,0 +1,162 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 KUIS AI Center
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1 @@
- ---
- title: Pergel
- emoji: 📈
- colorFrom: blue
- colorTo: pink
- sdk: streamlit
- sdk_version: 1.40.2
- app_file: app.py
- pinned: false
- license: mit
- short_description: 'Pergel: A Unified Benchmark for Evaluating Turkish LLMs'
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Cetvel-leaderboard
app.py ADDED
@@ -0,0 +1,197 @@
+ import streamlit as st
+ import pandas as pd
+ import json
+ from utils import read_results, preprocess_path, get_model_url
+ from data import Tasks, Metrics, DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS
+
+
+ st.set_page_config(
+     page_title='Cetvel 📏',
+     layout='centered',
+ )
+
+
+ @st.cache_data
+ def cache_results(path):
+     json_results = read_results(path)
+     results = list()
+     for entry in json_results:
+         row = {
+             'model': entry['model']['model'],
+             'num_parameters': entry['model']['num_parameters'],
+             'url': get_model_url(entry['model']),
+             'architecture': entry['model']['architecture'],
+             'type': entry['model']['type'],
+             'precision': entry['model']['dtype'],
+         }
+         for result in entry['results']:
+             task = result['task']
+             metric = TASK_METRIC_DICT.get(task)
+             score = result.get(metric)
+             score = 100 * score if metric != Metrics.WER and score is not None else score
+             row[result['name']] = score
+         results.append(row)
+     df = pd.DataFrame(results)
+     for group, metadata in DATASET_GROUPS.items():
+         df[group] = df[metadata['datasets']].mean(axis=1)
+     return df
+
+
+ @st.cache_data
+ def cache_datasets(path):
+     path = preprocess_path(path)
+     with open(path, 'r') as f:
+         datasets = json.load(f)
+     for key in datasets.keys():
+         datasets[key]['dataset'] = key
+     return datasets
+
+
+ def create_column_configs(items):
+     column_configs = dict()
+     for key, metadata in items.items():
+         column_configs[key] = st.column_config.NumberColumn(
+             metadata.get('name', key),
+             help=metadata['description'],
+             min_value=0,
+             format="%2.2f"
+         )
+     return column_configs
+
+
+ def insert_average(df, keys):
+     df = df.copy(deep=True)
+     df['average'] = df.loc[:, [x for x in df.columns if x in keys]].mean(axis=1)
+     df.insert(1, 'average', df.pop('average'))
+     df.index += 1
+     return df.sort_values(by=['average'], ascending=False)
+
+
+ MODEL_SPEC_CONFIGS = {
+     'model': st.column_config.TextColumn(
+         'Model',
+         help='Large Language Model (LLM) used for the experiments.',
+         max_chars=120,
+
+     ),
+     'url': st.column_config.LinkColumn(
+         'URL',
+         help='Model URL.',
+         display_text='Click',
+     ),
+     'num_parameters': st.column_config.TextColumn(
+         '#params',
+         help='Approximate number of parameters.',
+     ),
+     'type': st.column_config.TextColumn(
+         'Type',
+         help='Model type based on training objective.',
+     ),
+     'average': st.column_config.NumberColumn(
+         'Avg.',
+         help='Average across task or dataset performances.',
+         format="%2.2f",
+     )
+ }
+
+
+ def filter_visible_model_specs():
+     specs = {
+         'URL': ('url', 1),
+         '#params': ('num_parameters', 2),
+         'Architecture': ('architecture', 3),
+         'Type': ('type', 4),
+         'Precision': ('precision', 5),
+     }
+     visible_specs = st.multiselect(
+         'Select model specs to be shown in the table.',
+         options=sorted(specs.keys(), key=lambda x: specs[x][1]),
+     )
+     # visible_specs = sorted(visible_specs, key=lambda x: specs[x][1])
+     return [specs[x][0] for x in visible_specs]
+
+
+ def filter_by_model_spec():
+     pass
+
+
+ def filter_visible_datasets(datasets):
+     col1, col2 = st.columns(2)
+     with col1:
+         dataset_grouping = st.selectbox(
+             'Dataset Grouping',
+             [
+                 'Group Datasets',
+                 'Show All Datasets',
+             ],
+         )
+
+     with col2:
+         filter_by_task = st.selectbox(
+             'Filter by Task',
+             [
+                 'All',
+                 'Understanding Tasks',
+                 'Generation Tasks',
+                 'Multiple Choice',
+                 'Extractive Question Answering',
+                 'Natural Language Inference',
+                 'Text Classification',
+                 'Summarization',
+             ],
+             disabled=dataset_grouping == "Group Datasets",
+         )
+
+     if dataset_grouping == 'Group Datasets':
+         return list(DATASET_GROUPS.keys())
+     elif dataset_grouping == 'Show All Datasets':
+         if filter_by_task == 'All':
+             return list(datasets.keys())
+         elif filter_by_task == 'Understanding Tasks':
+             this_datasets = [k for (k, v) in datasets.items() if not v['generative']]
+             return this_datasets
+         elif filter_by_task == 'Generation Tasks':
+             this_datasets = [k for (k, v) in datasets.items() if v['generative']]
+             return this_datasets
+         elif filter_by_task == 'Multiple Choice':
+             return DATASET_GROUPS['MCQA']['datasets']
+         elif filter_by_task == 'Extractive Question Answering':
+             return DATASET_GROUPS['QA']['datasets']
+         elif filter_by_task == 'Natural Language Inference':
+             return DATASET_GROUPS['NLI']['datasets']
+         elif filter_by_task == 'Text Classification':
+             return DATASET_GROUPS['TC']['datasets']
+         elif filter_by_task == 'Summarization':
+             return DATASET_GROUPS['SUM']['datasets']
+
+
+ def introduction():
+     st.title(':blue[Cetvel :straight_ruler:]')
+     st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
+     st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool, specifically includes tasks/datasets for benchmarking Turkish Large Language Models (LLMs). Cetvel includes a variety of tasks curated to assess different aspects of model performance in the Turkish language. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, you can check the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')
+
+
+ def main():
+     introduction()
+     results_df = cache_results('./results/zero-shot')
+     datasets = cache_datasets('./data/datasets.json')
+     dataset_column_configs = create_column_configs(datasets)
+     group_column_configs = create_column_configs(DATASET_GROUPS)
+     # score_columns = list(dataset_column_configs.keys()) + list(group_column_configs.keys())
+     column_configs = MODEL_SPEC_CONFIGS | group_column_configs | dataset_column_configs
+
+     visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
+     visible_model_columns = filter_visible_model_specs()
+     results_df = insert_average(results_df, visible_data_columns)
+
+     st.dataframe(
+         results_df,
+         use_container_width=True,
+         hide_index=True,
+         column_config=column_configs,
+         column_order=['model', 'average',] + visible_model_columns + visible_data_columns,
+     )
+     st.image('./assets/kuis-ai-logo.png', width=240)
+
+
+ main()
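
Note on the scoring step in `cache_results` above: every headline metric except WER is multiplied by 100, and each dataset group column is the unweighted mean of its member datasets. A minimal standalone sketch of that averaging step, using toy scores and a hypothetical model name:

```python
import pandas as pd

# Toy row mimicking one entry produced by cache_results (scores already scaled to 0-100).
row = {'model': 'toy-llm', 'xquad_tr': 24.7, 'tquad': 20.6, 'mkqa_tr': 10.0}
df = pd.DataFrame([row])

# Group average, as in `df[group] = df[metadata['datasets']].mean(axis=1)` above.
df['QA'] = df[['xquad_tr', 'tquad', 'mkqa_tr']].mean(axis=1)
print(df['QA'].iloc[0])  # (24.7 + 20.6 + 10.0) / 3 ≈ 18.43
```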
assets/kuis-ai-logo.png ADDED
data.py ADDED
@@ -0,0 +1,121 @@
+ from enum import StrEnum, auto
+
+
+ class Tasks(StrEnum):
+     EXTRACTIVE_QUESTION_ANSWERING = auto()
+     MULTIPLE_CHOICE = auto()
+     SUMMARIZATION = auto()
+     NATURAL_LANGUAGE_INFERENCE = auto()
+     TEXT_CLASSIFICATION = auto()
+     MACHINE_TRANSLATION = auto()
+     GRAMMATICAL_ERROR_CORRECTION = auto()
+
+
+ class Metrics(StrEnum):
+     F1 = "f1"
+     EXACT_MATCH = "exact_match"
+     ROGUE1 = "rouge1"
+     ROUGE2 = "rouge2"
+     ROUGEL = "rougeL"
+     ACCURACY = "acc"
+     WER = "wer"
+     BLEU = "bleu"
+
+
+ DATASET_TASK_DICT = {
+     # extractive qa
+     'xquad_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+     'tquad': Tasks.EXTRACTIVE_QUESTION_ANSWERING,
+     'mkqa_tr': Tasks.EXTRACTIVE_QUESTION_ANSWERING, # not exactly
+
+     # summarization
+     'xlsum_tr': Tasks.SUMMARIZATION,
+     'mlsum_tr': Tasks.SUMMARIZATION,
+     'wiki_lingua_tr': Tasks.SUMMARIZATION,
+     'tr-wikihow-summ': Tasks.SUMMARIZATION,
+
+     # NLI
+     #'nli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'mnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'snli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+     'xnli_tr': Tasks.NATURAL_LANGUAGE_INFERENCE,
+
+     # multiple-choice
+     'xcopa_tr': Tasks.MULTIPLE_CHOICE,
+     'exams_tr': Tasks.MULTIPLE_CHOICE,
+     'belebele_tr': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_goal_inference': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_next_event_prediction': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_step_inference': Tasks.MULTIPLE_CHOICE,
+     'turkish_plu_step_ordering': Tasks.MULTIPLE_CHOICE,
+
+     # fact-checking, not sure whether these are multi-choice
+     # 'trclaim19': Tasks.MULTIPLE_CHOICE,
+     'check_worthiness': Tasks.MULTIPLE_CHOICE,
+     'relevance_judgment': Tasks.MULTIPLE_CHOICE,
+
+     # text classification
+     'sts_tr': Tasks.TEXT_CLASSIFICATION,
+     'offenseval_tr': Tasks.TEXT_CLASSIFICATION,
+     'news_cat': Tasks.TEXT_CLASSIFICATION,
+     'ironytr': Tasks.TEXT_CLASSIFICATION,
+
+     # other generation
+     'wmt-tr-en-prompt': Tasks.MACHINE_TRANSLATION,
+     'gecturk_generation': Tasks.GRAMMATICAL_ERROR_CORRECTION,
+ }
+
+
+ TASK_METRIC_DICT = {
+     Tasks.EXTRACTIVE_QUESTION_ANSWERING: Metrics.EXACT_MATCH,
+     Tasks.MULTIPLE_CHOICE: Metrics.ACCURACY,
+     Tasks.TEXT_CLASSIFICATION: Metrics.ACCURACY,
+     Tasks.NATURAL_LANGUAGE_INFERENCE: Metrics.ACCURACY,
+     Tasks.SUMMARIZATION: Metrics.ROUGE2,
+     Tasks.MACHINE_TRANSLATION: Metrics.BLEU,
+     Tasks.GRAMMATICAL_ERROR_CORRECTION: Metrics.EXACT_MATCH,
+ }
+
+
+ GENERATIVE_TASKS = (
+     Tasks.SUMMARIZATION,
+     Tasks.MACHINE_TRANSLATION,
+     Tasks.GRAMMATICAL_ERROR_CORRECTION,
+ )
+
+ DATASET_GROUPS = {
+     'QA': {
+         'datasets': ['xquad_tr', 'tquad', 'mkqa_tr'],
+         'description': 'Turkish splits of SQuAD-like datasets XQuAD and TQUAD.',
+     },
+     'MCQA': {
+         'datasets': ['xcopa_tr', 'exams_tr', 'belebele_tr'] + [x for x in DATASET_TASK_DICT.keys() if x.startswith('turkish_plu')],
+         'description': 'Multiple Choice Question Answering datasets: XCOPA, Exams, Belebele and Turkish PLU.'
+     },
+     'TC': {
+         'datasets': ['sts_tr', 'offenseval_tr', 'news_cat', 'ironytr', ],
+         'description': 'Text Classification datasets.',
+     },
+     'NLI': {
+         'datasets': ['mnli_tr', 'snli_tr', 'xnli_tr'],
+         'description': 'Natural Language Inference (NLI) datasets in Turkish: XNLI, SNLI and MNLI.',
+     },
+     'SUM': {
+         'datasets': [name for name, task in DATASET_TASK_DICT.items() if task == Tasks.SUMMARIZATION],
+         'description': 'Summarization datasets in Turkish (XLSum, MLSum, WikiLingua and TrWikiHowSumm).',
+     },
+     'GEC': {
+         'datasets': ['gecturk_generation',],
+         'description': 'Grammatical Error Correction task.',
+     },
+     'MT': {
+         'datasets': ['wmt-tr-en-prompt'],
+         'description': 'Machine Translation on WMT-16 dataset (English-to-Turkish).',
+     },
+
+     # 'TrClaim19': {
+     #     'datasets': ['check_worthiness', 'relevance_judgment'],
+     #     'description': 'TrClaim19 dataset for fact-checking.',
+     # },
+ }
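
The two dictionaries above determine which score the leaderboard reports for each dataset, and `DATASET_GROUPS` defines the aggregated columns. A small usage sketch, assuming `data.py` is importable as in this repository:

```python
from data import DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS

# A dataset maps to its task, and the task maps to the headline metric.
task = DATASET_TASK_DICT['xquad_tr']     # Tasks.EXTRACTIVE_QUESTION_ANSWERING
metric = TASK_METRIC_DICT[task]          # Metrics.EXACT_MATCH
print(task, metric)

# Group membership used for the aggregated columns in app.py.
print(DATASET_GROUPS['SUM']['datasets'])  # all summarization datasets
```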
data/datasets.json ADDED
@@ -0,0 +1,185 @@
1
+ {
2
+ "tquad": {
3
+ "name": "TQUAD",
4
+ "task": "extractive_question_answering",
5
+ "description": "This dataset is the Turkish Question & Answer dataset on Turkish & Islamic Science History within the scope of Teknofest 2018 Artificial Intelligence competition.",
6
+ "url": "https://github.com/TQuad/turkish-nlp-qa-dataset",
7
+ "hf_name": "mcemilg/tquad",
8
+ "generative": false
9
+ },
10
+ "xquad_tr": {
11
+ "name": "XQUAD",
12
+ "task": "extractive_question_answering",
13
+ "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi..",
14
+ "url": "https://github.com/google-deepmind/xquad",
15
+ "hf_name": "google/xquad",
16
+ "generative": false
17
+ },
18
+ "mkqa_tr": {
19
+ "name": "MKQA",
20
+ "task": "extractive_question_answering",
21
+ "description": "MKQA: Multilingual Knowledge Questions & Answers. MKQA includes 10k open-domain question-answer pairs in 26 languages, resulting 260k examples in total.",
22
+ "url": "https://github.com/apple/ml-mkqa",
23
+ "hf_name": "mcemilg/mkqa_tr",
24
+ "generative": false
25
+ },
26
+ "xlsum_tr": {
27
+ "name": "XLSum",
28
+ "task": "summarization",
29
+ "description": "Abstractive summarization dataset for 44 languages.",
30
+ "url": "https://github.com/csebuetnlp/xl-sum",
31
+ "hf_name": "csebuetnlp/xlsum",
32
+ "generative": true
33
+ },
34
+ "mlsum_tr": {
35
+ "name": "MLSum",
36
+ "task": "summarization",
37
+ "description": "A multilingual summarization dataset collected from the newspapers' websites. MLSum contains 1.5M examples in 5 languages including Turkish.",
38
+ "url": "https://huggingface.co/datasets/reciTAL/mlsum",
39
+ "hf_name": "reciTAL/mlsum",
40
+ "generative": true
41
+ },
42
+ "wiki_lingua_tr": {
43
+ "name": "WikiLingua",
44
+ "task": "summarization",
45
+ "description": "A multilingual abstractive summarization dataset covering 17 languages.",
46
+ "url": "https://github.com/esdurmus/Wikilingua",
47
+ "hf_name": "GEM/wiki_lingua",
48
+ "generative": true
49
+ },
50
+ "tr-wikihow-summ": {
51
+ "name": "WikiHowSumm",
52
+ "task": "summarization",
53
+ "description": "A summarization dataset obtained from WikiHow website.",
54
+ "url": "https://huggingface.co/datasets/ardauzunoglu/tr-wikihow-summ",
55
+ "hf_name": "ardauzunoglu/tr-wikihow-summ",
56
+ "generative": true
57
+ },
58
+ "mnli_tr": {
59
+ "name": "MNLI",
60
+ "task": "natural_language_inference",
61
+ "description": "Multi-Genre NLI (MNLI) dataset.",
62
+ "url": "https://cims.nyu.edu/~sbowman/multinli/",
63
+ "hf_name": "boun-tabi/nli_tr",
64
+ "generative": false
65
+ },
66
+ "snli_tr": {
67
+ "name": "SNLI",
68
+ "task": "natural_language_inference",
69
+ "description": "The Stanford NLI (SNLI) dataset.",
70
+ "url": "https://nlp.stanford.edu/projects/snli/",
71
+ "hf_name": "boun-tabi/nli_tr",
72
+ "generative": false
73
+ },
74
+ "xnli_tr": {
75
+ "name": "XNLI",
76
+ "task": "natural_language_inference",
77
+ "description": "The Cross-Lingual NLI (XNLI) dataset.",
78
+ "url": "https://github.com/facebookresearch/XNLI",
79
+ "hf_name": "boun-tabi/nli_tr",
80
+ "generative": false
81
+ },
82
+ "xcopa_tr": {
83
+ "name": "XCOPA",
84
+ "task": "multiple_choice",
85
+ "description": "A multilingual dataset for evaluating causal commonsense reasoning capabilities of language models.",
86
+ "url": "https://github.com/cambridgeltl/xcopa",
87
+ "hf_name": "cambridgeltl/xcopa",
88
+ "generative": false
89
+ },
90
+ "exams_tr": {
91
+ "name": "Exams",
92
+ "task": "multiple_choice",
93
+ "description": "A question answering dataset covering high school exams.",
94
+ "url": "https://huggingface.co/datasets/exams",
95
+ "hf_name": "exams",
96
+ "generative": false
97
+ },
98
+ "belebele_tr": {
99
+ "name": "Belebele",
100
+ "task": "multiple_choice",
101
+ "description": "A multiple choice question answering dataset to evaluate machine comprehension.",
102
+ "url": "https://github.com/facebookresearch/belebele",
103
+ "generative": false
104
+ },
105
+ "turkish_plu_goal_inference": {
106
+ "name": "PLU-GI",
107
+ "task": "multiple_choice",
108
+ "description": "TurkishPLU - Goal Inference task.",
109
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
110
+ "hf_name": "mcemilg/turkish-plu-goal-inference",
111
+ "generative": false
112
+ },
113
+ "turkish_plu_next_event_prediction": {
114
+ "name": "PLU-NE",
115
+ "task": "multiple_choice",
116
+ "description": "TurkishPLU - Next Event Prediction task.",
117
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
118
+ "hf_name": "mcemilg/turkish-plu-next-event-prediction",
119
+ "generative": false
120
+ },
121
+ "turkish_plu_step_inference": {
122
+ "name": "PLU-SI",
123
+ "task": "multiple_choice",
124
+ "description": "TurkishPLU - Step Inference task.",
125
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
126
+ "hf_name": "mcemilg/turkish-plu-step-inference",
127
+ "generative": false
128
+ },
129
+ "turkish_plu_step_ordering": {
130
+ "name": "PLU-SO",
131
+ "task": "multiple_choice",
132
+ "description": "TurkishPLU - Step Ordering task.",
133
+ "url": "https://github.com/GGLAB-KU/turkish-plu",
134
+ "hf_name": "mcemilg/turkish-plu-step-ordering",
135
+ "generative": false
136
+ },
137
+ "sts_tr": {
138
+ "name": "STS",
139
+ "task": "text_classification",
140
+ "description": "The machine-translated Semantic Textual Similarity dataset in Turkish.",
141
+ "url": "https://github.com/emrecncelik/sts-benchmark-tr",
142
+ "hf_name": "emrecan/stsb-mt-turkish",
143
+ "generative": false
144
+ },
145
+ "offenseval_tr": {
146
+ "name": "OffensEval",
147
+ "task": "text_classification",
148
+ "description": "A dataset for offensive speech recognition in Turkish.",
149
+ "url": "https://sites.google.com/site/offensevalsharedtask/offenseval-2020",
150
+ "hf_name": "coltekin/offenseval2020_tr",
151
+ "generative": false
152
+ },
153
+ "news_cat": {
154
+ "name": "NewsCat",
155
+ "task": "text_classification",
156
+ "description": "News classification dataset collected from Turkish newspapers websites.",
157
+ "url": "http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html",
158
+ "hf_name": "mcemilg/news-cat",
159
+ "generative": false
160
+ },
161
+ "ironytr": {
162
+ "name": "IronyTR",
163
+ "task": "text_classification",
164
+ "description": "Irony detection dataset in Turkish.",
165
+ "url": "https://github.com/teghub/IronyTR",
166
+ "hf_name": "mcemilg/IronyTR",
167
+ "generative": false
168
+ },
169
+ "wmt-tr-en-prompt": {
170
+ "name": "WMT",
171
+ "task": "machine_translation",
172
+ "description": "English-to-Turkish machine translation dataset.",
173
+ "url": "http://www.aclweb.org/anthology/W/W16/W16-2301",
174
+ "hf_name": "wmt/wmt16",
175
+ "generative": true
176
+ },
177
+ "gecturk_generation": {
178
+ "name": "GECTurk",
179
+ "task": "grammatical_error_correction",
180
+ "description": "A dataset for grammatical error correction.",
181
+ "url": "https://github.com/GGLAB-KU/gecturk",
182
+ "hf_name": "mcemilg/GECTurk-generation",
183
+ "generative": true
184
+ }
185
+ }
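
Each entry in this file is keyed by the dataset's lm-eval-harness task name; `cache_datasets` in `app.py` loads the file and copies the key into a `dataset` field before building the column configs. A minimal sketch of that consumption, using the path from the repository:

```python
import json

with open('data/datasets.json') as f:
    datasets = json.load(f)

# Mirror of the loop in app.cache_datasets: expose the key inside each record.
for key in datasets:
    datasets[key]['dataset'] = key

print(datasets['xquad_tr']['task'])        # extractive_question_answering
print(datasets['xquad_tr']['generative'])  # False
```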
environment.yaml ADDED
@@ -0,0 +1,93 @@
1
+ name: Cetvel-leaderboard
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.7.2=h06a4308_0
9
+ - expat=2.6.2=h6a678d5_0
10
+ - ld_impl_linux-64=2.38=h1181459_1
11
+ - libffi=3.4.4=h6a678d5_1
12
+ - libgcc-ng=11.2.0=h1234567_1
13
+ - libgomp=11.2.0=h1234567_1
14
+ - libstdcxx-ng=11.2.0=h1234567_1
15
+ - libuuid=1.41.5=h5eee18b_0
16
+ - ncurses=6.4=h6a678d5_0
17
+ - openssl=3.0.14=h5eee18b_0
18
+ - python=3.12.4=h5148396_1
19
+ - readline=8.2=h5eee18b_0
20
+ - sqlite=3.45.3=h5eee18b_0
21
+ - tk=8.6.14=h39e8969_0
22
+ - wheel=0.43.0=py312h06a4308_0
23
+ - xz=5.4.6=h5eee18b_1
24
+ - zlib=1.2.13=h5eee18b_1
25
+ - pip:
26
+ - altair==5.3.0
27
+ - asttokens==2.4.1
28
+ - attrs==23.2.0
29
+ - blinker==1.8.2
30
+ - cachetools==5.3.3
31
+ - certifi==2024.7.4
32
+ - charset-normalizer==3.3.2
33
+ - click==8.1.7
34
+ - contourpy==1.2.1
35
+ - cycler==0.12.1
36
+ - decorator==5.1.1
37
+ - executing==2.0.1
38
+ - fonttools==4.53.1
39
+ - gitdb==4.0.11
40
+ - gitpython==3.1.43
41
+ - idna==3.7
42
+ - ipdb==0.13.13
43
+ - ipython==8.26.0
44
+ - jedi==0.19.1
45
+ - jinja2==3.1.4
46
+ - jsonschema==4.23.0
47
+ - jsonschema-specifications==2023.12.1
48
+ - kiwisolver==1.4.5
49
+ - markdown-it-py==3.0.0
50
+ - markupsafe==2.1.5
51
+ - matplotlib==3.9.1
52
+ - matplotlib-inline==0.1.7
53
+ - mdurl==0.1.2
54
+ - numpy==2.0.0
55
+ - packaging==24.1
56
+ - pandas==2.2.2
57
+ - parso==0.8.4
58
+ - pexpect==4.9.0
59
+ - pillow==10.4.0
60
+ - pip==24.1.2
61
+ - prompt-toolkit==3.0.47
62
+ - protobuf==5.27.2
63
+ - ptyprocess==0.7.0
64
+ - pure-eval==0.2.2
65
+ - pyarrow==16.1.0
66
+ - pydeck==0.9.1
67
+ - pygments==2.18.0
68
+ - pyparsing==3.1.2
69
+ - python-dateutil==2.9.0.post0
70
+ - pytz==2024.1
71
+ - redis==5.0.7
72
+ - referencing==0.35.1
73
+ - requests==2.32.3
74
+ - rich==13.7.1
75
+ - rpds-py==0.19.0
76
+ - semantic-version==2.10.0
77
+ - setuptools==70.3.0
78
+ - setuptools-rust==1.9.0
79
+ - six==1.16.0
80
+ - smmap==5.0.1
81
+ - stack-data==0.6.3
82
+ - streamlit==1.36.0
83
+ - tenacity==8.5.0
84
+ - toml==0.10.2
85
+ - toolz==0.12.1
86
+ - tornado==6.4.1
87
+ - traitlets==5.14.3
88
+ - typing-extensions==4.12.2
89
+ - tzdata==2024.1
90
+ - urllib3==2.2.2
91
+ - watchdog==4.0.1
92
+ - wcwidth==0.2.13
93
+ prefix: /home/ilker/miniconda3/envs/streamlit-tutor
process_result.py ADDED
@@ -0,0 +1,72 @@
+ import os.path as osp
+ import argparse
+ import json
+ from data import Tasks, DATASET_TASK_DICT
+ from utils import preprocess_path
+
+
+ def process_result(entry, name, task):
+     processed = {
+         'name': name,
+         'task': str(task),
+     }
+
+     if task == Tasks.EXTRACTIVE_QUESTION_ANSWERING:
+         key = 'em,none' if name == 'mkqa_tr' else 'exact,none'
+         scale = 0.01 if name != 'mkqa_tr' else 1
+         processed['exact_match'] = scale * entry[key]
+         processed['f1'] = scale * entry['f1,none']
+     elif task == Tasks.SUMMARIZATION:
+         processed['rouge1'] = entry['rouge1,none']
+         processed['rouge2'] = entry['rouge2,none']
+         processed['rougeL'] = entry['rougeL,none']
+     elif task in (
+         Tasks.MULTIPLE_CHOICE,
+         Tasks.NATURAL_LANGUAGE_INFERENCE,
+         Tasks.TEXT_CLASSIFICATION,
+     ):
+         processed['acc'] = entry['acc,none']
+         processed['acc_norm'] = entry.get('acc_norm,none', processed['acc'])
+     elif task == Tasks.MACHINE_TRANSLATION:
+         processed['wer'] = entry['wer,none']
+         processed['bleu'] = entry['bleu,none']
+     elif task == Tasks.GRAMMATICAL_ERROR_CORRECTION:
+         processed['exact_match'] = entry['exact_match,none']
+
+     return processed
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Results file formatter.')
+     parser.add_argument('-i', '--input-file', type=str, help='Input JSON file for the results.')
+     parser.add_argument('-o', '--output-file', type=str, help='Output JSON file for the formatted results.')
+     args = parser.parse_args()
+
+     with open(preprocess_path(args.input_file)) as f:
+         raw_data = json.load(f)
+
+     # first, get model args
+     model_args = raw_data['config']['model_args'].split(',')
+     model_args = dict([tuple(pair.split('=')) for pair in model_args])
+     processed = dict()
+     model_args['model'] = model_args.pop('pretrained')
+     processed['model'] = model_args
+     processed['model']['api'] = raw_data['config']['model']
+
+     # then, process results
+     results = raw_data['results']
+     processed['results'] = list()
+     for dataset, entry in results.items():
+         if dataset not in DATASET_TASK_DICT.keys():
+             continue
+         task = DATASET_TASK_DICT[dataset]
+         processed['results'].append(process_result(entry, dataset, task))
+
+     with open(preprocess_path(args.output_file), 'w') as f:
+         json.dump(processed, f, indent=4)
+
+     print('done')
+
+
+ if __name__ == '__main__':
+     main()
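
This formatter is invoked as `python process_result.py -i <raw>.json -o <formatted>.json` and expects a raw lm-eval-harness results file with a `config.model_args` string plus per-dataset metric keys such as `acc,none`. A small sketch of calling `process_result` directly on one such entry; the numbers are illustrative only:

```python
from data import Tasks
from process_result import process_result

# Hypothetical raw entry in lm-eval-harness style for a multiple-choice dataset.
raw_entry = {'acc,none': 0.6067, 'acc_norm,none': 0.6067}
print(process_result(raw_entry, 'belebele_tr', Tasks.MULTIPLE_CHOICE))
# -> {'name': 'belebele_tr', 'task': 'multiple_choice', 'acc': 0.6067, 'acc_norm': 0.6067}
```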
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ altair==5.3.0
+ click==8.1.7
+ matplotlib==3.9.1
+ numpy==2.0.0
+ pandas==2.2.2
+ pillow==10.4.0
+ streamlit==1.36.0
+ tornado==6.4.1
results/zero-shot/aya-23-8b.json ADDED
@@ -0,0 +1,161 @@
1
+ {
2
+ "model": {
3
+ "load_in_8bit": "True",
4
+ "trust_remote_code": "True",
5
+ "model": "CohereForAI/aya-23-8B",
6
+ "api": "hf",
7
+ "architecture": "CohereForCausalLM",
8
+ "dtype": "float16",
9
+ "max_length": 8192,
10
+ "type": "instruction-tuned",
11
+ "num_parameters": "8b"
12
+ },
13
+ "results": [
14
+ {
15
+ "name": "belebele_tr",
16
+ "task": "multiple_choice",
17
+ "acc": 0.6067,
18
+ "acc_norm": 0.6067
19
+ },
20
+ {
21
+ "name": "exams_tr",
22
+ "task": "multiple_choice",
23
+ "acc": 0.2697,
24
+ "acc_norm": 0.2901
25
+ },
26
+ {
27
+ "name": "check_worthiness",
28
+ "task": "multiple_choice",
29
+ "acc": 0.38345521023765994,
30
+ "acc_norm": 0.49177330895795246
31
+ },
32
+ {
33
+ "name": "ironytr",
34
+ "task": "text_classification",
35
+ "acc": 0.5166666666666667,
36
+ "acc_norm": 0.5016666666666667
37
+ },
38
+ {
39
+ "name": "mkqa_tr",
40
+ "task": "extractive_question_answering",
41
+ "exact_match": 0.10017756732761172,
42
+ "f1": 0.16569513329103133
43
+ },
44
+ {
45
+ "name": "mnli_tr",
46
+ "task": "natural_language_inference",
47
+ "acc": 0.3436,
48
+ "acc_norm": 0.3477
49
+ },
50
+ {
51
+ "name": "news_cat",
52
+ "task": "text_classification",
53
+ "acc": 0.724,
54
+ "acc_norm": 0.632
55
+ },
56
+ {
57
+ "name": "offenseval_tr",
58
+ "task": "text_classification",
59
+ "acc": 0.3424036281179138,
60
+ "acc_norm": 0.7865646258503401
61
+ },
62
+ {
63
+ "name": "relevance_judgment",
64
+ "task": "multiple_choice",
65
+ "acc": 0.42550274223034734,
66
+ "acc_norm": 0.4273308957952468
67
+ },
68
+ {
69
+ "name": "snli_tr",
70
+ "task": "natural_language_inference",
71
+ "acc": 0.3249,
72
+ "acc_norm": 0.3367
73
+ },
74
+ {
75
+ "name": "sts_tr",
76
+ "task": "text_classification",
77
+ "acc": 0.22987672226250908,
78
+ "acc_norm": 0.19434372733865118
79
+ },
80
+ {
81
+ "name": "tquad",
82
+ "task": "extractive_question_answering",
83
+ "exact_match": 0.2062780269058296,
84
+ "f1": 0.4653972244152745
85
+ },
86
+ {
87
+ "name": "turkish_plu_goal_inference",
88
+ "task": "multiple_choice",
89
+ "acc": 0.3918757467144564,
90
+ "acc_norm": 0.3859020310633214
91
+ },
92
+ {
93
+ "name": "turkish_plu_next_event_prediction",
94
+ "task": "multiple_choice",
95
+ "acc": 0.4687022900763359,
96
+ "acc_norm": 0.5374045801526718
97
+ },
98
+ {
99
+ "name": "turkish_plu_step_inference",
100
+ "task": "multiple_choice",
101
+ "acc": 0.33986928104575165,
102
+ "acc_norm": 0.45098039215686275
103
+ },
104
+ {
105
+ "name": "turkish_plu_step_ordering",
106
+ "task": "multiple_choice",
107
+ "acc": 0.6180215475024485,
108
+ "acc_norm": 0.6180215475024485
109
+ },
110
+ {
111
+ "name": "xcopa_tr",
112
+ "task": "multiple_choice",
113
+ "acc": 0.596,
114
+ "acc_norm": 0.596
115
+ },
116
+ {
117
+ "name": "xnli_tr",
118
+ "task": "natural_language_inference",
119
+ "acc": 0.4771084337349398,
120
+ "acc_norm": 0.4771084337349398
121
+ },
122
+ {
123
+ "name": "xquad_tr",
124
+ "task": "extractive_question_answering",
125
+ "exact_match": 0.24705882352941178,
126
+ "f1": 0.44192474929656556
127
+ },
128
+ {
129
+ "name": "gecturk_generation",
130
+ "task": "grammatical_error_correction",
131
+ "exact_match": 0.008281573498964804
132
+ },
133
+ {
134
+ "name": "mlsum_tr",
135
+ "task": "summarization",
136
+ "rouge1": 0.37037019926313125,
137
+ "rouge2": 0.24005923597941317,
138
+ "rougeL": 0.31098002776173184
139
+ },
140
+ {
141
+ "name": "wiki_lingua_tr",
142
+ "task": "summarization",
143
+ "rouge1": 0.2645070959726481,
144
+ "rouge2": 0.11354354716145479,
145
+ "rougeL": 0.21357621995467704
146
+ },
147
+ {
148
+ "name": "wmt-tr-en-prompt",
149
+ "task": "machine_translation",
150
+ "wer": 0.7464128097803795,
151
+ "bleu": 0.16878189334002527
152
+ },
153
+ {
154
+ "name": "xlsum_tr",
155
+ "task": "summarization",
156
+ "rouge1": 0.2855728817569547,
157
+ "rouge2": 0.14081555638864124,
158
+ "rougeL": 0.23467303626936886
159
+ }
160
+ ]
161
+ }
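
Each file under `results/zero-shot/` has this shape and is one entry of the list consumed by `cache_results` in `app.py`. The loader `utils.read_results` itself is not part of this commit, so its exact behavior is not shown; a plausible sketch, assuming it simply loads every JSON file in the given directory, would be:

```python
import glob
import json
import os.path as osp

def read_results(path):
    # Assumption: one formatted JSON per model, e.g. results/zero-shot/*.json.
    entries = []
    for fname in sorted(glob.glob(osp.join(path, '*.json'))):
        with open(fname) as f:
            entries.append(json.load(f))
    return entries
```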
results/zero-shot/aya-expanse-8b.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "CohereForAI/aya-expanse-8b",
4
+ "api": "hf",
5
+ "architecture": "CohereForCausalLM",
6
+ "max_length": 8192,
7
+ "dtype": "float16",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.7355555555555555,
16
+ "acc_norm": 0.7355555555555555
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3155216284987277,
22
+ "acc_norm": 0.3460559796437659
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.4026508226691042,
28
+ "acc_norm": 0.6224862888482633
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.0018296499590736194
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.505,
39
+ "acc_norm": 0.49833333333333335
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.06954720331459012,
45
+ "f1": 0.13476533908972033
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.363610486561065,
51
+ "rouge2": 0.21362825588593481,
52
+ "rougeL": 0.29773476508614094
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.3078,
58
+ "acc_norm": 0.35
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.76,
64
+ "acc_norm": 0.58
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.2675736961451247,
70
+ "acc_norm": 0.7956349206349206
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.5877513711151737,
76
+ "acc_norm": 0.579981718464351
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.344,
82
+ "acc_norm": 0.3435
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.2095721537345903,
88
+ "acc_norm": 0.21029731689630166
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.13452914798206278,
94
+ "f1": 0.435087842533856
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.4062126642771804,
100
+ "acc_norm": 0.3930704898446834
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.4900763358778626,
106
+ "acc_norm": 0.5465648854961832
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.3464052287581699,
112
+ "acc_norm": 0.4395424836601307
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5935357492654261,
118
+ "acc_norm": 0.5935357492654261
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.3064320242538614,
124
+ "rouge2": 0.1340385267540697,
125
+ "rougeL": 0.24764232131755232
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 0.7822550373875778,
131
+ "bleu": 0.17034711245148307
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.578,
137
+ "acc_norm": 0.578
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.26621653203927675,
143
+ "rouge2": 0.133428873146516,
144
+ "rougeL": 0.2083669711429916
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.4919678714859438,
150
+ "acc_norm": 0.4919678714859438
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.2495798319327731,
156
+ "f1": 0.4735125568867167
157
+ }
158
+ ]
159
+ }
results/zero-shot/aya101.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": 4096,
5
+ "model": "CohereForAI/aya-101",
6
+ "api": "hf",
7
+ "architecture": "T5ForConditionalGeneration",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "13b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.07563025210084033,
16
+ "f1": 0.16462359535888943
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.02416422194769531,
22
+ "rouge2": 0.00149839274458772,
23
+ "rougeL": 0.02416422194769531
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.596,
29
+ "acc_norm": 0.596
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 0.9853633715998092,
35
+ "bleu": 0.0
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.029006633700390562,
41
+ "rouge2": 0.0004998910319276452,
42
+ "rougeL": 0.028967197984657227
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.41344,
48
+ "acc_norm": 0.42816
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.3739545997610514,
54
+ "acc_norm": 0.33811230585424135
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.34961832061068704,
60
+ "acc_norm": 0.38625954198473283
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.272875816993464,
66
+ "acc_norm": 0.35784313725490197
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5710088148873653,
72
+ "acc_norm": 0.5710088148873653
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.553473491773309,
78
+ "acc_norm": 0.6238574040219378
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.6709323583180987,
84
+ "acc_norm": 0.5781535648994516
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.02053796966151103,
90
+ "rouge2": 0.00029270301029826366,
91
+ "rougeL": 0.020495031370814234
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.053811659192825115,
97
+ "f1": 0.09199690627084456
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.1696881798404641,
103
+ "acc_norm": 0.18781725888324874
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.7993197278911565,
109
+ "acc_norm": 0.7970521541950113
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.279,
115
+ "acc_norm": 0.3386
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.2558,
121
+ "acc_norm": 0.3279
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.2998003992015968,
127
+ "acc_norm": 0.34291417165668664
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.2,
133
+ "acc_norm": 0.2
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.021746360547255133,
139
+ "rouge2": 0.003113110667892852,
140
+ "rougeL": 0.021727065059735186
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.025451316957679788,
146
+ "f1": 0.05324060372891391
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.5216666666666666,
152
+ "acc_norm": 0.5
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.0
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.22900763358778625,
163
+ "acc_norm": 0.2366412213740458
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.2288888888888889,
169
+ "acc_norm": 0.2288888888888889
170
+ }
171
+ ]
172
+ }
results/zero-shot/commencis-7b.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "Commencis/Commencis-LLM",
6
+ "api": "hf",
7
+ "architecture": "MistralForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.06638655462184874,
16
+ "f1": 0.22895337255761397
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.23661435034483103,
22
+ "rouge2": 0.09475637339836376,
23
+ "rougeL": 0.17114647899378693
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.58,
29
+ "acc_norm": 0.58
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 1.292660190832963,
35
+ "bleu": 0.046829706960566486
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.20899244459581318,
41
+ "rouge2": 0.06262304805792501,
42
+ "rougeL": 0.15190187433999106
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.4128,
48
+ "acc_norm": 0.46176
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.34767025089605735,
54
+ "acc_norm": 0.38948626045400236
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.38625954198473283,
60
+ "acc_norm": 0.46259541984732827
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.2761437908496732,
66
+ "acc_norm": 0.3872549019607843
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.56513222331048,
72
+ "acc_norm": 0.56513222331048
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.3903107861060329,
78
+ "acc_norm": 0.4835466179159049
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.5077696526508226,
84
+ "acc_norm": 0.526508226691042
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.23101542478965895,
90
+ "rouge2": 0.0718775262261334,
91
+ "rougeL": 0.16318786708633073
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.053811659192825115,
97
+ "f1": 0.3110458108565287
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.14865844815083393,
103
+ "acc_norm": 0.2226250906453952
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.24263038548752835,
109
+ "acc_norm": 0.29365079365079366
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3058,
115
+ "acc_norm": 0.3103
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.2972,
121
+ "acc_norm": 0.32
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.3141716566866267,
127
+ "acc_norm": 0.3281437125748503
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.624,
133
+ "acc_norm": 0.368
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.30963778437323686,
139
+ "rouge2": 0.16100694114326877,
140
+ "rougeL": 0.23447680384800107
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.0324060372891388,
146
+ "f1": 0.07231572678508513
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.56,
152
+ "acc_norm": 0.54
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.1701574461938466
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.24681933842239187,
163
+ "acc_norm": 0.29770992366412213
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.3233333333333333,
169
+ "acc_norm": 0.3233333333333333
170
+ }
171
+ ]
172
+ }
results/zero-shot/kanarya-2b.json ADDED
@@ -0,0 +1,171 @@
1
+ {
2
+ "model": {
3
+ "dtype": "float16",
4
+ "model": "asafaya/kanarya-2b",
5
+ "api": "hf",
6
+ "architecture": "GPTJForCausalLM",
7
+ "type": "pretrained",
8
+ "num_parameters": "3b"
9
+ },
10
+ "results": [
11
+ {
12
+ "name": "belebele_tr",
13
+ "task": "multiple_choice",
14
+ "acc": 0.2811111111111111,
15
+ "acc_norm": 0.2811111111111111
16
+ },
17
+ {
18
+ "name": "exams_tr",
19
+ "task": "multiple_choice",
20
+ "acc": 0.30025445292620867,
21
+ "acc_norm": 0.3256997455470738
22
+ },
23
+ {
24
+ "name": "gecturk_generation",
25
+ "task": "grammatical_error_correction",
26
+ "exact_match": 9.62973662670326e-05
27
+ },
28
+ {
29
+ "name": "ironytr",
30
+ "task": "text_classification",
31
+ "acc": 0.5,
32
+ "acc_norm": 0.5016666666666667
33
+ },
34
+ {
35
+ "name": "mkqa_tr",
36
+ "task": "extractive_question_answering",
37
+ "exact_match": 0.005770938147380882,
38
+ "f1": 0.0157485308417537
39
+ },
40
+ {
41
+ "name": "mlsum_tr",
42
+ "task": "summarization",
43
+ "rouge1": 0.380182975983147,
44
+ "rouge2": 0.2469518162622865,
45
+ "rougeL": 0.30607429328228153
46
+ },
47
+ {
48
+ "name": "news_cat",
49
+ "task": "text_classification",
50
+ "acc": 0.668,
51
+ "acc_norm": 0.556
52
+ },
53
+ {
54
+ "name": "mnli_tr",
55
+ "task": "natural_language_inference",
56
+ "acc": 0.3278,
57
+ "acc_norm": 0.3463
58
+ },
59
+ {
60
+ "name": "snli_tr",
61
+ "task": "natural_language_inference",
62
+ "acc": 0.3088,
63
+ "acc_norm": 0.3109
64
+ },
65
+ {
66
+ "name": "xnli_tr",
67
+ "task": "natural_language_inference",
68
+ "acc": 0.3273453093812375,
69
+ "acc_norm": 0.3341317365269461
70
+ },
71
+ {
72
+ "name": "offenseval_tr",
73
+ "task": "text_classification",
74
+ "acc": 0.6159297052154195,
75
+ "acc_norm": 0.796485260770975
76
+ },
77
+ {
78
+ "name": "sts_tr",
79
+ "task": "text_classification",
80
+ "acc": 0.12907904278462654,
81
+ "acc_norm": 0.12037708484408992
82
+ },
83
+ {
84
+ "name": "tquad",
85
+ "task": "extractive_question_answering",
86
+ "exact_match": 0.016816143497757848,
87
+ "f1": 0.046325790025566756
88
+ },
89
+ {
90
+ "name": "check_worthiness",
91
+ "task": "multiple_choice",
92
+ "acc": 0.623400365630713,
93
+ "acc_norm": 0.6238574040219378
94
+ },
95
+ {
96
+ "name": "relevance_judgment",
97
+ "task": "multiple_choice",
98
+ "acc": 0.5068555758683729,
99
+ "acc_norm": 0.5758683729433273
100
+ },
101
+ {
102
+ "name": "turkish_plu",
103
+ "task": "multiple_choice",
104
+ "acc": 0.4928,
105
+ "acc_norm": 0.536
106
+ },
107
+ {
108
+ "name": "turkish_plu_goal_inference",
109
+ "task": "multiple_choice",
110
+ "acc": 0.45878136200716846,
111
+ "acc_norm": 0.46714456391875747
112
+ },
113
+ {
114
+ "name": "turkish_plu_next_event_prediction",
115
+ "task": "multiple_choice",
116
+ "acc": 0.45648854961832064,
117
+ "acc_norm": 0.5190839694656488
118
+ },
119
+ {
120
+ "name": "turkish_plu_step_inference",
121
+ "task": "multiple_choice",
122
+ "acc": 0.35784313725490197,
123
+ "acc_norm": 0.5
124
+ },
125
+ {
126
+ "name": "turkish_plu_step_ordering",
127
+ "task": "multiple_choice",
128
+ "acc": 0.6248775710088149,
129
+ "acc_norm": 0.6248775710088149
130
+ },
131
+ {
132
+ "name": "wiki_lingua_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.14941800836498376,
135
+ "rouge2": 0.04469826846423095,
136
+ "rougeL": 0.11118162846926655
137
+ },
138
+ {
139
+ "name": "wmt-tr-en-prompt",
140
+ "task": "machine_translation",
141
+ "wer": 2.833755212322392,
142
+ "bleu": 0.030496946295093332
143
+ },
144
+ {
145
+ "name": "xcopa_tr",
146
+ "task": "multiple_choice",
147
+ "acc": 0.642,
148
+ "acc_norm": 0.642
149
+ },
150
+ {
151
+ "name": "xlsum_tr",
152
+ "task": "summarization",
153
+ "rouge1": 0.2462743722502333,
154
+ "rouge2": 0.09312295140534987,
155
+ "rougeL": 0.1685445897911506
156
+ },
157
+ {
158
+ "name": "tr-wikihow-summ",
159
+ "task": "summarization",
160
+ "rouge1": null,
161
+ "rouge2": null,
162
+ "rougeL": null
163
+ },
164
+ {
165
+ "name": "xquad_tr",
166
+ "task": "extractive_question_answering",
167
+ "exact_match": 0.008403361344537815,
168
+ "f1": 0.027799180278171867
169
+ }
170
+ ]
171
+ }
results/zero-shot/llama-3-8b-instruct.json ADDED
@@ -0,0 +1,160 @@
1
+ {
2
+ "model": {
3
+ "trust_remote_code": "True",
4
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
5
+ "api": "hf",
6
+ "architecture": "LlamaForCausalLM",
7
+ "max_length": 8192,
8
+ "type": "instruction-tuned",
9
+ "dtype": "bfloat16",
10
+ "num_parameters": "8b"
11
+ },
12
+ "results": [
13
+ {
14
+ "name": "belebele_tr",
15
+ "task": "multiple_choice",
16
+ "acc": 0.6633333333333333,
17
+ "acc_norm": 0.6633333333333333
18
+ },
19
+ {
20
+ "name": "exams_tr",
21
+ "task": "multiple_choice",
22
+ "acc": 0.2697201017811705,
23
+ "acc_norm": 0.3104325699745547
24
+ },
25
+ {
26
+ "name": "check_worthiness",
27
+ "task": "multiple_choice",
28
+ "acc": 0.4218464351005484,
29
+ "acc_norm": 0.5644424131627057
30
+ },
31
+ {
32
+ "name": "ironytr",
33
+ "task": "text_classification",
34
+ "acc": 0.545,
35
+ "acc_norm": 0.6466666666666666
36
+ },
37
+ {
38
+ "name": "mkqa_tr",
39
+ "task": "extractive_question_answering",
40
+ "exact_match": 0.0424681858538029,
41
+ "f1": 0.11050423163975964
42
+ },
43
+ {
44
+ "name": "mnli_tr",
45
+ "task": "natural_language_inference",
46
+ "acc": 0.3201,
47
+ "acc_norm": 0.3653
48
+ },
49
+ {
50
+ "name": "news_cat",
51
+ "task": "text_classification",
52
+ "acc": 0.628,
53
+ "acc_norm": 0.588
54
+ },
55
+ {
56
+ "name": "offenseval_tr",
57
+ "task": "text_classification",
58
+ "acc": 0.3081065759637188,
59
+ "acc_norm": 0.7304421768707483
60
+ },
61
+ {
62
+ "name": "relevance_judgment",
63
+ "task": "multiple_choice",
64
+ "acc": 0.603290676416819,
65
+ "acc_norm": 0.5790676416819013
66
+ },
67
+ {
68
+ "name": "snli_tr",
69
+ "task": "natural_language_inference",
70
+ "acc": 0.3283,
71
+ "acc_norm": 0.353
72
+ },
73
+ {
74
+ "name": "sts_tr",
75
+ "task": "text_classification",
76
+ "acc": 0.14213197969543148,
77
+ "acc_norm": 0.21537345902828137
78
+ },
79
+ {
80
+ "name": "tquad",
81
+ "task": "extractive_question_answering",
82
+ "exact_match": 0.1289237668161435,
83
+ "f1": 0.4134057883004977
84
+ },
85
+ {
86
+ "name": "turkish_plu_goal_inference",
87
+ "task": "multiple_choice",
88
+ "acc": 0.38829151732377537,
89
+ "acc_norm": 0.43130227001194743
90
+ },
91
+ {
92
+ "name": "turkish_plu_next_event_prediction",
93
+ "task": "multiple_choice",
94
+ "acc": 0.4549618320610687,
95
+ "acc_norm": 0.517557251908397
96
+ },
97
+ {
98
+ "name": "turkish_plu_step_inference",
99
+ "task": "multiple_choice",
100
+ "acc": 0.3137254901960784,
101
+ "acc_norm": 0.44281045751633985
102
+ },
103
+ {
104
+ "name": "turkish_plu_step_ordering",
105
+ "task": "multiple_choice",
106
+ "acc": 0.6160626836434868,
107
+ "acc_norm": 0.6160626836434868
108
+ },
109
+ {
110
+ "name": "xcopa_tr",
111
+ "task": "multiple_choice",
112
+ "acc": 0.586,
113
+ "acc_norm": 0.586
114
+ },
115
+ {
116
+ "name": "xnli_tr",
117
+ "task": "natural_language_inference",
118
+ "acc": 0.4389558232931727,
119
+ "acc_norm": 0.4389558232931727
120
+ },
121
+ {
122
+ "name": "xquad_tr",
123
+ "task": "extractive_question_answering",
124
+ "exact_match": 0.09747899159663864,
125
+ "f1": 0.24450355256139333
126
+ },
127
+ {
128
+ "name": "gecturk_generation",
129
+ "task": "grammatical_error_correction",
130
+ "exact_match": 0.005007463045885695
131
+ },
132
+ {
133
+ "name": "mlsum_tr",
134
+ "task": "summarization",
135
+ "rouge1": 0.40612528796779146,
136
+ "rouge2": 0.25769550481564407,
137
+ "rougeL": 0.3281187592669974
138
+ },
139
+ {
140
+ "name": "wiki_lingua_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.23621778991663983,
143
+ "rouge2": 0.08052321922363763,
144
+ "rougeL": 0.1710165526266978
145
+ },
146
+ {
147
+ "name": "wmt-tr-en-prompt",
148
+ "task": "machine_translation",
149
+ "wer": 0.823814082821166,
150
+ "bleu": 0.13572050882587958
151
+ },
152
+ {
153
+ "name": "xlsum_tr",
154
+ "task": "summarization",
155
+ "rouge1": 0.29619456321037296,
156
+ "rouge2": 0.13520487191226377,
157
+ "rougeL": 0.220446635816053
158
+ }
159
+ ]
160
+ }
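Each results file added in this commit follows the same layout: a "model" block with loading metadata (checkpoint name, API, dtype, context length, parameter count) and a "results" list holding per-dataset metrics. As a minimal, illustrative sketch only (the file path and field names are taken from the JSON above; nothing here is part of the commit), one report can be consumed like this:

    import json

    # Load a single zero-shot report and average its accuracy-style scores.
    with open('results/zero-shot/llama-3-8b-instruct.json') as f:
        report = json.load(f)

    accs = [r['acc'] for r in report['results'] if 'acc' in r]
    print(report['model']['model'], sum(accs) / len(accs))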
results/zero-shot/llama-3-8b.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3-8B",
4
+ "api": "hf",
5
+ "architecture": "LlamaForCausalLM",
6
+ "max_length": 8192,
7
+ "type": "pretrained",
8
+ "dtype": "bfloat16",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.5144,
16
+ "acc_norm": 0.5144
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3028,
22
+ "acc_norm": 0.3537
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.38391224862888484
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.515,
34
+ "acc_norm": 0.525
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.13465522343888725,
40
+ "f1": 0.19144550324599957
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3206,
46
+ "acc_norm": 0.3329
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.724,
52
+ "acc_norm": 0.656
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.2193877551020408,
58
+ "acc_norm": 0.48214285714285715
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.42550274223034734,
64
+ "acc_norm": 0.5173674588665448
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.325,
70
+ "acc_norm": 0.3766
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.16388687454677303,
76
+ "acc_norm": 0.19216823785351705
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.28475336322869954,
82
+ "f1": 0.5013148868557868
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.38948626045400236,
88
+ "acc_norm": 0.4169653524492234
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4488549618320611,
94
+ "acc_norm": 0.5328244274809161
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.32189542483660133,
100
+ "acc_norm": 0.47058823529411764
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.6278158667972575,
106
+ "acc_norm": 0.6278158667972575
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.618,
112
+ "acc_norm": 0.618
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4839357429718876,
118
+ "acc_norm": 0.4839357429718876
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.20840336134453782,
124
+ "f1": 0.33796418555415153
125
+ },
126
+ {
127
+ "name": "gecturk_generation",
128
+ "task": "grammatical_error_correction",
129
+ "exact_match": 0.006692666955558766
130
+ },
131
+ {
132
+ "name": "mlsum_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.38446881575055203,
135
+ "rouge2": 0.2503978598237102,
136
+ "rougeL": 0.319713589198042
137
+ },
138
+ {
139
+ "name": "wiki_lingua_tr",
140
+ "task": "summarization",
141
+ "rouge1": 0.2069234464456151,
142
+ "rouge2": 0.06576422586110373,
143
+ "rougeL": 0.1516869929958613
144
+ },
145
+ {
146
+ "name": "wmt-tr-en-prompt",
147
+ "task": "machine_translation",
148
+ "wer": 0.9262281724087097,
149
+ "bleu": 0.113320746345327
150
+ },
151
+ {
152
+ "name": "xlsum_tr",
153
+ "task": "summarization",
154
+ "rouge1": 0.2615001361521869,
155
+ "rouge2": 0.11093149007661907,
156
+ "rougeL": 0.20321693263972507
157
+ }
158
+ ]
159
+ }
results/zero-shot/llama-3.1-8b-instruct.json ADDED
@@ -0,0 +1,159 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.7077777777777777,
16
+ "acc_norm": 0.7077777777777777
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3231552162849873,
22
+ "acc_norm": 0.35877862595419846
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.37614259597806216
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.5133333333333333,
34
+ "acc_norm": 0.5666666666666667
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.09115122817401598,
40
+ "f1": 0.15627870028803578
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3209,
46
+ "acc_norm": 0.3596
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.66,
52
+ "acc_norm": 0.604
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.23582766439909297,
58
+ "acc_norm": 0.3687641723356009
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4648080438756856,
64
+ "acc_norm": 0.5648994515539305
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.3028,
70
+ "acc_norm": 0.3528
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.19579405366207397,
76
+ "acc_norm": 0.1551849166062364
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.23318385650224216,
82
+ "f1": 0.5062272078338648
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.40860215053763443,
88
+ "acc_norm": 0.45997610513739545
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4442748091603053,
94
+ "acc_norm": 0.5419847328244275
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.33169934640522875,
100
+ "acc_norm": 0.4624183006535948
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.633692458374143,
106
+ "acc_norm": 0.633692458374143
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.608,
112
+ "acc_norm": 0.608
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4807228915662651,
118
+ "acc_norm": 0.4807228915662651
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.21428571428571427,
124
+ "f1": 0.4170277103753468
125
+ },
126
+ {
127
+ "name": "gecturk_generation",
128
+ "task": "grammatical_error_correction",
129
+ "exact_match": 0.005007463045885695
130
+ },
131
+ {
132
+ "name": "mlsum_tr",
133
+ "task": "summarization",
134
+ "rouge1": 0.40612528796779146,
135
+ "rouge2": 0.25769550481564407,
136
+ "rougeL": 0.3281187592669974
137
+ },
138
+ {
139
+ "name": "wiki_lingua_tr",
140
+ "task": "summarization",
141
+ "rouge1": 0.23621778991663983,
142
+ "rouge2": 0.08052321922363763,
143
+ "rougeL": 0.1710165526266978
144
+ },
145
+ {
146
+ "name": "wmt-tr-en-prompt",
147
+ "task": "machine_translation",
148
+ "wer": 0.823814082821166,
149
+ "bleu": 0.13572050882587958
150
+ },
151
+ {
152
+ "name": "xlsum_tr",
153
+ "task": "summarization",
154
+ "rouge1": 0.29619456321037296,
155
+ "rouge2": 0.13520487191226377,
156
+ "rougeL": 0.220446635816053
157
+ }
158
+ ]
159
+ }
results/zero-shot/llama-3.1-8b.json ADDED
@@ -0,0 +1,127 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Meta-Llama-3.1-8B",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "8b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.6144,
16
+ "acc_norm": 0.6144
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.3130,
22
+ "acc_norm": 0.3537
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.37751371115173676
29
+ },
30
+ {
31
+ "name": "ironytr",
32
+ "task": "text_classification",
33
+ "acc": 0.585,
34
+ "acc_norm": 0.5183333333333333
35
+ },
36
+ {
37
+ "name": "mkqa_tr",
38
+ "task": "extractive_question_answering",
39
+ "exact_match": 0.09248298313110388,
40
+ "f1": 0.15127108197296948
41
+ },
42
+ {
43
+ "name": "mnli_tr",
44
+ "task": "natural_language_inference",
45
+ "acc": 0.3495,
46
+ "acc_norm": 0.3481
47
+ },
48
+ {
49
+ "name": "news_cat",
50
+ "task": "text_classification",
51
+ "acc": 0.692,
52
+ "acc_norm": 0.588
53
+ },
54
+ {
55
+ "name": "offenseval_tr",
56
+ "task": "text_classification",
57
+ "acc": 0.3463718820861678,
58
+ "acc_norm": 0.7636054421768708
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4227605118829982,
64
+ "acc_norm": 0.506398537477148
65
+ },
66
+ {
67
+ "name": "snli_tr",
68
+ "task": "natural_language_inference",
69
+ "acc": 0.3169,
70
+ "acc_norm": 0.3379
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.17041334300217548,
76
+ "acc_norm": 0.2001450326323423
77
+ },
78
+ {
79
+ "name": "tquad",
80
+ "task": "extractive_question_answering",
81
+ "exact_match": 0.2757847533632287,
82
+ "f1": 0.5178366277473359
83
+ },
84
+ {
85
+ "name": "turkish_plu_goal_inference",
86
+ "task": "multiple_choice",
87
+ "acc": 0.4145758661887694,
88
+ "acc_norm": 0.4324970131421744
89
+ },
90
+ {
91
+ "name": "turkish_plu_next_event_prediction",
92
+ "task": "multiple_choice",
93
+ "acc": 0.4488549618320611,
94
+ "acc_norm": 0.5358778625954198
95
+ },
96
+ {
97
+ "name": "turkish_plu_step_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.3382352941176471,
100
+ "acc_norm": 0.4738562091503268
101
+ },
102
+ {
103
+ "name": "turkish_plu_step_ordering",
104
+ "task": "multiple_choice",
105
+ "acc": 0.6425073457394711,
106
+ "acc_norm": 0.6425073457394711
107
+ },
108
+ {
109
+ "name": "xcopa_tr",
110
+ "task": "multiple_choice",
111
+ "acc": 0.626,
112
+ "acc_norm": 0.626
113
+ },
114
+ {
115
+ "name": "xnli_tr",
116
+ "task": "natural_language_inference",
117
+ "acc": 0.4947791164658635,
118
+ "acc_norm": 0.4947791164658635
119
+ },
120
+ {
121
+ "name": "xquad_tr",
122
+ "task": "extractive_question_answering",
123
+ "exact_match": 0.2092436974789916,
124
+ "f1": 0.35674599908781446
125
+ }
126
+ ]
127
+ }
results/zero-shot/llama-3.2-1b.json ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Llama-3.2-1B",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "1b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.29555555555555557,
16
+ "acc_norm": 0.29555555555555557
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.28498727735368956,
22
+ "acc_norm": 0.3053435114503817
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.3880255941499086,
28
+ "acc_norm": 0.623400365630713
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.00741489720256151
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.5283333333333333,
39
+ "acc_norm": 0.5033333333333333
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.007694584196507843,
45
+ "f1": 0.03304091036050505
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.23283491254211872,
51
+ "rouge2": 0.13426790568610214,
52
+ "rougeL": 0.18915548037371513
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.3232,
58
+ "acc_norm": 0.334
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.58,
64
+ "acc_norm": 0.532
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.4671201814058957,
70
+ "acc_norm": 0.7820294784580499
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.56672760511883,
76
+ "acc_norm": 0.5781535648994516
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.3239,
82
+ "acc_norm": 0.3105
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.17113850616388687,
88
+ "acc_norm": 0.22552574329224076
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.06278026905829596,
94
+ "f1": 0.21486130318406463
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.35842293906810035,
100
+ "acc_norm": 0.4026284348864994
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.3709923664122137,
106
+ "acc_norm": 0.467175572519084
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.27941176470588236,
112
+ "acc_norm": 0.41830065359477125
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5759059745347699,
118
+ "acc_norm": 0.5759059745347699
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.10861529436199803,
124
+ "rouge2": 0.034862923521078545,
125
+ "rougeL": 0.08692160533533941
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 3.910683208136067,
131
+ "bleu": 0.012043288243775466
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.556,
137
+ "acc_norm": 0.556
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.16924699150407269,
143
+ "rouge2": 0.07190935921365724,
144
+ "rougeL": 0.13255123335488528
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.4389558232931727,
150
+ "acc_norm": 0.4389558232931727
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.04873949579831932,
156
+ "f1": 0.11156636293859905
157
+ },
158
+ {
159
+ "name": "gecturk_generation",
160
+ "task": "grammatical_error_correction",
161
+ "exact_match": 0.0073185998362944775
162
+ },
163
+ {
164
+ "name": "mlsum_tr",
165
+ "task": "summarization",
166
+ "rouge1": 0.35440052022111407,
167
+ "rouge2": 0.2215476501673455,
168
+ "rougeL": 0.2911311598176804
169
+ },
170
+ {
171
+ "name": "wiki_lingua_tr",
172
+ "task": "summarization",
173
+ "rouge1": 0.18510384577665046,
174
+ "rouge2": 0.056181066004903614,
175
+ "rougeL": 0.1392211003290612
176
+ },
177
+ {
178
+ "name": "wmt-tr-en-prompt",
179
+ "task": "machine_translation",
180
+ "wer": 1.311990023748812,
181
+ "bleu": 0.02624044942774961
182
+ },
183
+ {
184
+ "name": "xlsum_tr",
185
+ "task": "summarization",
186
+ "rouge1": 0.2429304790539497,
187
+ "rouge2": 0.09668008744707143,
188
+ "rougeL": 0.18327092913535944
189
+ }
190
+ ]
191
+ }
results/zero-shot/llama-3.2-3b-instruct.json ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "model": {
3
+ "model": "meta-llama/Llama-3.2-3B-Instruct",
4
+ "api": "hf",
5
+ "dtype": "bfloat16",
6
+ "max_length": 131072,
7
+ "architecture": "LlamaForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "3b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "belebele_tr",
14
+ "task": "multiple_choice",
15
+ "acc": 0.5577777777777778,
16
+ "acc_norm": 0.5577777777777778
17
+ },
18
+ {
19
+ "name": "exams_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.26208651399491095,
22
+ "acc_norm": 0.3053435114503817
23
+ },
24
+ {
25
+ "name": "check_worthiness",
26
+ "task": "multiple_choice",
27
+ "acc": 0.37614259597806216,
28
+ "acc_norm": 0.3807129798903108
29
+ },
30
+ {
31
+ "name": "gecturk_generation",
32
+ "task": "grammatical_error_correction",
33
+ "exact_match": 0.007222302470027445
34
+ },
35
+ {
36
+ "name": "ironytr",
37
+ "task": "text_classification",
38
+ "acc": 0.5016666666666667,
39
+ "acc_norm": 0.5083333333333333
40
+ },
41
+ {
42
+ "name": "mkqa_tr",
43
+ "task": "extractive_question_answering",
44
+ "exact_match": 0.04675939627108612,
45
+ "f1": 0.08114473798410345
46
+ },
47
+ {
48
+ "name": "mlsum_tr",
49
+ "task": "summarization",
50
+ "rouge1": 0.2669056212126977,
51
+ "rouge2": 0.1480446780314802,
52
+ "rougeL": 0.2106440565987865
53
+ },
54
+ {
55
+ "name": "mnli_tr",
56
+ "task": "natural_language_inference",
57
+ "acc": 0.32,
58
+ "acc_norm": 0.3141
59
+ },
60
+ {
61
+ "name": "news_cat",
62
+ "task": "text_classification",
63
+ "acc": 0.64,
64
+ "acc_norm": 0.552
65
+ },
66
+ {
67
+ "name": "offenseval_tr",
68
+ "task": "text_classification",
69
+ "acc": 0.20634920634920634,
70
+ "acc_norm": 0.35600907029478457
71
+ },
72
+ {
73
+ "name": "relevance_judgment",
74
+ "task": "multiple_choice",
75
+ "acc": 0.4227605118829982,
76
+ "acc_norm": 0.42413162705667273
77
+ },
78
+ {
79
+ "name": "snli_tr",
80
+ "task": "natural_language_inference",
81
+ "acc": 0.319,
82
+ "acc_norm": 0.2923
83
+ },
84
+ {
85
+ "name": "sts_tr",
86
+ "task": "text_classification",
87
+ "acc": 0.12907904278462654,
88
+ "acc_norm": 0.16896301667875271
89
+ },
90
+ {
91
+ "name": "tquad",
92
+ "task": "extractive_question_answering",
93
+ "exact_match": 0.18721973094170405,
94
+ "f1": 0.5109898180473623
95
+ },
96
+ {
97
+ "name": "turkish_plu_goal_inference",
98
+ "task": "multiple_choice",
99
+ "acc": 0.3321385902031063,
100
+ "acc_norm": 0.3548387096774194
101
+ },
102
+ {
103
+ "name": "turkish_plu_next_event_prediction",
104
+ "task": "multiple_choice",
105
+ "acc": 0.3648854961832061,
106
+ "acc_norm": 0.4488549618320611
107
+ },
108
+ {
109
+ "name": "turkish_plu_step_inference",
110
+ "task": "multiple_choice",
111
+ "acc": 0.24183006535947713,
112
+ "acc_norm": 0.3758169934640523
113
+ },
114
+ {
115
+ "name": "turkish_plu_step_ordering",
116
+ "task": "multiple_choice",
117
+ "acc": 0.5710088148873653,
118
+ "acc_norm": 0.5710088148873653
119
+ },
120
+ {
121
+ "name": "wiki_lingua_tr",
122
+ "task": "summarization",
123
+ "rouge1": 0.1342879173103036,
124
+ "rouge2": 0.041489300068460175,
125
+ "rougeL": 0.10482785510181569
126
+ },
127
+ {
128
+ "name": "wmt-tr-en-prompt",
129
+ "task": "machine_translation",
130
+ "wer": 1.7706536060519733,
131
+ "bleu": 0.048843165627950165
132
+ },
133
+ {
134
+ "name": "xcopa_tr",
135
+ "task": "multiple_choice",
136
+ "acc": 0.546,
137
+ "acc_norm": 0.546
138
+ },
139
+ {
140
+ "name": "xlsum_tr",
141
+ "task": "summarization",
142
+ "rouge1": 0.17224405229987672,
143
+ "rouge2": 0.06736413357191079,
144
+ "rougeL": 0.12750762702828333
145
+ },
146
+ {
147
+ "name": "xnli_tr",
148
+ "task": "natural_language_inference",
149
+ "acc": 0.42811244979919677,
150
+ "acc_norm": 0.42811244979919677
151
+ },
152
+ {
153
+ "name": "xquad_tr",
154
+ "task": "extractive_question_answering",
155
+ "exact_match": 0.23025210084033615,
156
+ "f1": 0.4335914561273987
157
+ },
158
+ {
159
+ "name": "gecturk_generation",
160
+ "task": "grammatical_error_correction",
161
+ "exact_match": 0.009726033992970293
162
+ },
163
+ {
164
+ "name": "mlsum_tr",
165
+ "task": "summarization",
166
+ "rouge1": 0.36482642805140486,
167
+ "rouge2": 0.2215366481025873,
168
+ "rougeL": 0.2964001074060548
169
+ },
170
+ {
171
+ "name": "wiki_lingua_tr",
172
+ "task": "summarization",
173
+ "rouge1": 0.21420020104688736,
174
+ "rouge2": 0.06939715371402275,
175
+ "rougeL": 0.1623531918550368
176
+ },
177
+ {
178
+ "name": "wmt-tr-en-prompt",
179
+ "task": "machine_translation",
180
+ "wer": 0.9910280580654681,
181
+ "bleu": 0.08179536823012563
182
+ },
183
+ {
184
+ "name": "xlsum_tr",
185
+ "task": "summarization",
186
+ "rouge1": 0.2616423061938248,
187
+ "rouge2": 0.11064039063859936,
188
+ "rougeL": 0.19686955120787036
189
+ }
190
+ ]
191
+ }
results/zero-shot/mistral-7b.json ADDED
@@ -0,0 +1,165 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "mistralai/Mistral-7B-v0.1",
6
+ "api": "hf",
7
+ "architecture": "MixtralForCausalLM",
8
+ "type": "pretrained",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.16722689075630254,
16
+ "f1": 0.32150094374615246
17
+ },
18
+ {
19
+ "name": "xcopa_tr",
20
+ "task": "multiple_choice",
21
+ "acc": 0.566,
22
+ "acc_norm": 0.566
23
+ },
24
+ {
25
+ "name": "turkish_plu",
26
+ "task": "multiple_choice",
27
+ "acc": 0.45152,
28
+ "acc_norm": 0.5136
29
+ },
30
+ {
31
+ "name": "turkish_plu_goal_inference",
32
+ "task": "multiple_choice",
33
+ "acc": 0.42771804062126645,
34
+ "acc_norm": 0.46714456391875747
35
+ },
36
+ {
37
+ "name": "turkish_plu_next_event_prediction",
38
+ "task": "multiple_choice",
39
+ "acc": 0.39541984732824426,
40
+ "acc_norm": 0.5022900763358779
41
+ },
42
+ {
43
+ "name": "turkish_plu_step_inference",
44
+ "task": "multiple_choice",
45
+ "acc": 0.29248366013071897,
46
+ "acc_norm": 0.4411764705882353
47
+ },
48
+ {
49
+ "name": "turkish_plu_step_ordering",
50
+ "task": "multiple_choice",
51
+ "acc": 0.6023506366307542,
52
+ "acc_norm": 0.6023506366307542
53
+ },
54
+ {
55
+ "name": "check_worthiness",
56
+ "task": "multiple_choice",
57
+ "acc": 0.37614259597806216,
58
+ "acc_norm": 0.42458866544789764
59
+ },
60
+ {
61
+ "name": "relevance_judgment",
62
+ "task": "multiple_choice",
63
+ "acc": 0.4218464351005484,
64
+ "acc_norm": 0.49588665447897623
65
+ },
66
+ {
67
+ "name": "tquad",
68
+ "task": "extractive_question_answering",
69
+ "exact_match": 0.2096412556053812,
70
+ "f1": 0.4767364701184728
71
+ },
72
+ {
73
+ "name": "sts_tr",
74
+ "task": "text_classification",
75
+ "acc": 0.135605511240029,
76
+ "acc_norm": 0.20522117476432197
77
+ },
78
+ {
79
+ "name": "offenseval_tr",
80
+ "task": "text_classification",
81
+ "acc": 0.2046485260770975,
82
+ "acc_norm": 0.3735827664399093
83
+ },
84
+ {
85
+ "name": "mnli_tr",
86
+ "task": "natural_language_inference",
87
+ "acc": 0.3194,
88
+ "acc_norm": 0.3267
89
+ },
90
+ {
91
+ "name": "snli_tr",
92
+ "task": "natural_language_inference",
93
+ "acc": 0.3196,
94
+ "acc_norm": 0.3201
95
+ },
96
+ {
97
+ "name": "xnli_tr",
98
+ "task": "natural_language_inference",
99
+ "acc": 0.331936127744511,
100
+ "acc_norm": 0.34910179640718564
101
+ },
102
+ {
103
+ "name": "news_cat",
104
+ "task": "text_classification",
105
+ "acc": 0.652,
106
+ "acc_norm": 0.44
107
+ },
108
+ {
109
+ "name": "mkqa_tr",
110
+ "task": "extractive_question_answering",
111
+ "exact_match": 0.12030186445693992,
112
+ "f1": 0.16163416207615164
113
+ },
114
+ {
115
+ "name": "ironytr",
116
+ "task": "text_classification",
117
+ "acc": 0.5016666666666667,
118
+ "acc_norm": 0.52
119
+ },
120
+ {
121
+ "name": "exams_tr",
122
+ "task": "multiple_choice",
123
+ "acc": 0.24173027989821882,
124
+ "acc_norm": 0.30279898218829515
125
+ },
126
+ {
127
+ "name": "belebele_tr",
128
+ "task": "multiple_choice",
129
+ "acc": 0.37444444444444447,
130
+ "acc_norm": 0.37444444444444447
131
+ },
132
+ {
133
+ "name": "gecturk_generation",
134
+ "task": "grammatical_error_correction",
135
+ "exact_match": 0.20660599932591844
136
+ },
137
+ {
138
+ "name": "mlsum_tr",
139
+ "task": "summarization",
140
+ "rouge1": 0.09403885616158554,
141
+ "rouge2": 0.06300721907752257,
142
+ "rougeL": 0.08169726458665999
143
+ },
144
+ {
145
+ "name": "wiki_lingua_tr",
146
+ "task": "summarization",
147
+ "rouge1": 0.1905392717787084,
148
+ "rouge2": 0.05957088325130176,
149
+ "rougeL": 0.1472985242082243
150
+ },
151
+ {
152
+ "name": "wmt-tr-en-prompt",
153
+ "task": "machine_translation",
154
+ "wer": 1.0876062644712858,
155
+ "bleu": 0.04973628734419603
156
+ },
157
+ {
158
+ "name": "xlsum_tr",
159
+ "task": "summarization",
160
+ "rouge1": 0.02720399421152351,
161
+ "rouge2": 0.012032606076011431,
162
+ "rougeL": 0.02311080687545987
163
+ }
164
+ ]
165
+ }
results/zero-shot/trendyol-7b.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "bfloat16",
4
+ "max_length": "4096",
5
+ "model": "Trendyol/Trendyol-LLM-7b-base-v1.0",
6
+ "api": "hf",
7
+ "architecture": "MixtralForCausalLM",
8
+ "type": "instruction-tuned",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.0,
16
+ "f1": 0.15289561928390746
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.12128827095936726,
22
+ "rouge2": 0.05041801264157676,
23
+ "rougeL": 0.09604301857137748
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.61,
29
+ "acc_norm": 0.61
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 13.038665635458035,
35
+ "bleu": 0.010261135899096054
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.09429776166714862,
41
+ "rouge2": 0.02873358785517343,
42
+ "rougeL": 0.07767336257524773
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.46944,
48
+ "acc_norm": 0.49952
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.4635603345280765,
54
+ "acc_norm": 0.44683393070489846
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.43206106870229005,
60
+ "acc_norm": 0.48854961832061067
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.3235294117647059,
66
+ "acc_norm": 0.4395424836601307
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5857002938295789,
72
+ "acc_norm": 0.5857002938295789
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.37614259597806216,
78
+ "acc_norm": 0.37614259597806216
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.4218464351005484,
84
+ "acc_norm": 0.4218464351005484
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.1602888221320987,
90
+ "rouge2": 0.04616347811027626,
91
+ "rougeL": 0.12482407983918105
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.007847533632286996,
97
+ "f1": 0.26089513093937805
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.1551849166062364,
103
+ "acc_norm": 0.22697606961566352
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.20294784580498867,
109
+ "acc_norm": 0.20294784580498867
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3134,
115
+ "acc_norm": 0.2942
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.3204,
121
+ "acc_norm": 0.2894
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.32974051896207585,
127
+ "acc_norm": 0.300998003992016
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.812,
133
+ "acc_norm": 0.628
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.15450187559493767,
139
+ "rouge2": 0.08797823051939649,
140
+ "rougeL": 0.1350441813405041
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.001479727730097662,
146
+ "f1": 0.037161672000373895
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.5,
152
+ "acc_norm": 0.5
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.00048148683133516297
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.28498727735368956,
163
+ "acc_norm": 0.3486005089058524
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.3622222222222222,
169
+ "acc_norm": 0.3622222222222222
170
+ }
171
+ ]
172
+ }
results/zero-shot/turna.json ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "model": {
3
+ "dtype": "auto",
4
+ "max_length": "1024",
5
+ "model": "boun-tabi-LMG/TURNA",
6
+ "api": "hf",
7
+ "architecture": "T5ForCondtiionalGeneration",
8
+ "type": "pretrained",
9
+ "num_parameters": "7b"
10
+ },
11
+ "results": [
12
+ {
13
+ "name": "xquad_tr",
14
+ "task": "extractive_question_answering",
15
+ "exact_match": 0.0,
16
+ "f1": 0.0
17
+ },
18
+ {
19
+ "name": "xlsum_tr",
20
+ "task": "summarization",
21
+ "rouge1": 0.1904384366601188,
22
+ "rouge2": 0.060686113611140166,
23
+ "rougeL": 0.1311090280660866
24
+ },
25
+ {
26
+ "name": "xcopa_tr",
27
+ "task": "multiple_choice",
28
+ "acc": 0.558,
29
+ "acc_norm": 0.558
30
+ },
31
+ {
32
+ "name": "wmt-tr-en-prompt",
33
+ "task": "machine_translation",
34
+ "wer": 3.9036796738046218,
35
+ "bleu": 0.0008286617236874524
36
+ },
37
+ {
38
+ "name": "wiki_lingua_tr",
39
+ "task": "summarization",
40
+ "rouge1": 0.18435291474691423,
41
+ "rouge2": 0.05584649726914134,
42
+ "rougeL": 0.13446021077350823
43
+ },
44
+ {
45
+ "name": "turkish_plu",
46
+ "task": "multiple_choice",
47
+ "acc": 0.40288,
48
+ "acc_norm": 0.44608
49
+ },
50
+ {
51
+ "name": "turkish_plu_goal_inference",
52
+ "task": "multiple_choice",
53
+ "acc": 0.37992831541218636,
54
+ "acc_norm": 0.35722819593787336
55
+ },
56
+ {
57
+ "name": "turkish_plu_next_event_prediction",
58
+ "task": "multiple_choice",
59
+ "acc": 0.383206106870229,
60
+ "acc_norm": 0.4488549618320611
61
+ },
62
+ {
63
+ "name": "turkish_plu_step_inference",
64
+ "task": "multiple_choice",
65
+ "acc": 0.272875816993464,
66
+ "acc_norm": 0.4542483660130719
67
+ },
68
+ {
69
+ "name": "turkish_plu_step_ordering",
70
+ "task": "multiple_choice",
71
+ "acc": 0.5122428991185113,
72
+ "acc_norm": 0.5122428991185113
73
+ },
74
+ {
75
+ "name": "check_worthiness",
76
+ "task": "multiple_choice",
77
+ "acc": 0.42230347349177333,
78
+ "acc_norm": 0.620201096892139
79
+ },
80
+ {
81
+ "name": "relevance_judgment",
82
+ "task": "multiple_choice",
83
+ "acc": 0.4904021937842779,
84
+ "acc_norm": 0.5781535648994516
85
+ },
86
+ {
87
+ "name": "tr-wikihow-summ",
88
+ "task": "summarization",
89
+ "rouge1": 0.20515501424269858,
90
+ "rouge2": 0.05693981251975118,
91
+ "rougeL": 0.1449313333992171
92
+ },
93
+ {
94
+ "name": "tquad",
95
+ "task": "extractive_question_answering",
96
+ "exact_match": 0.0,
97
+ "f1": 0.0003736920777279522
98
+ },
99
+ {
100
+ "name": "sts_tr",
101
+ "task": "text_classification",
102
+ "acc": 0.14213197969543148,
103
+ "acc_norm": 0.19506889050036258
104
+ },
105
+ {
106
+ "name": "offenseval_tr",
107
+ "task": "text_classification",
108
+ "acc": 0.5099206349206349,
109
+ "acc_norm": 0.7970521541950113
110
+ },
111
+ {
112
+ "name": "mnli_tr",
113
+ "task": "natural_language_inference",
114
+ "acc": 0.3203,
115
+ "acc_norm": 0.3159
116
+ },
117
+ {
118
+ "name": "snli_tr",
119
+ "task": "natural_language_inference",
120
+ "acc": 0.3223,
121
+ "acc_norm": 0.3278
122
+ },
123
+ {
124
+ "name": "xnli_tr",
125
+ "task": "natural_language_inference",
126
+ "acc": 0.32974051896207585,
127
+ "acc_norm": 0.3277445109780439
128
+ },
129
+ {
130
+ "name": "news_cat",
131
+ "task": "text_classification",
132
+ "acc": 0.328,
133
+ "acc_norm": 0.208
134
+ },
135
+ {
136
+ "name": "mlsum_tr",
137
+ "task": "summarization",
138
+ "rouge1": 0.20830277213555015,
139
+ "rouge2": 0.11040542892341527,
140
+ "rougeL": 0.16135585618616377
141
+ },
142
+ {
143
+ "name": "mkqa_tr",
144
+ "task": "extractive_question_answering",
145
+ "exact_match": 0.0011837821840781297,
146
+ "f1": 0.006720430107526878
147
+ },
148
+ {
149
+ "name": "ironytr",
150
+ "task": "text_classification",
151
+ "acc": 0.48333333333333334,
152
+ "acc_norm": 0.5033333333333333
153
+ },
154
+ {
155
+ "name": "gecturk_generation",
156
+ "task": "grammatical_error_correction",
157
+ "exact_match": 0.0
158
+ },
159
+ {
160
+ "name": "exams_tr",
161
+ "task": "multiple_choice",
162
+ "acc": 0.2366412213740458,
163
+ "acc_norm": 0.2748091603053435
164
+ },
165
+ {
166
+ "name": "belebele_tr",
167
+ "task": "multiple_choice",
168
+ "acc": 0.22555555555555556,
169
+ "acc_norm": 0.22555555555555556
170
+ }
171
+ ]
172
+ }
utils.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import os.path as osp
+ import json
+
+
+ def preprocess_path(path):
+     # Expand "~" and resolve the path to an absolute path.
+     path = osp.expanduser(path)
+     path = osp.abspath(path)
+     return path
+
+
+ def get_model_url(entry):
+     # Models evaluated through the Hugging Face API link to their hub page;
+     # other entries fall back to an explicit "url" field when available.
+     if entry['api'] == 'hf':
+         return f'https://huggingface.co/{entry["model"]}'
+     return entry.get('url', f'https://localhost/{entry["model"]}')
+
+
+ def read_results(path):
+     # Load every JSON results file in the given directory, sorted by file name.
+     path = preprocess_path(path)
+     file_list = sorted(os.listdir(path))
+     results = list()
+     for file_name in file_list:
+         file_path = osp.join(path, file_name)
+         with open(file_path, 'r') as f:
+             this = json.load(f)
+         results.append(this)
+     return results
+
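For reference, a minimal usage sketch (not part of the commit) showing how the helpers above could load the zero-shot reports added in this commit; the 'results/zero-shot' directory comes from the file paths above, and the metric printed is an arbitrary choice:

    from utils import read_results, get_model_url

    # Read every JSON report in the directory and print one headline score per model.
    entries = read_results('results/zero-shot')
    for entry in entries:
        meta = entry['model']
        scores = {r['name']: r for r in entry['results']}
        belebele_acc = scores.get('belebele_tr', {}).get('acc')
        print(meta['model'], get_model_url(meta), belebele_acc)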