|
import streamlit as st |
|
import pandas as pd |
|
import json |
|
from utils import read_results, preprocess_path, get_model_url |
|
from data import Tasks, Metrics, DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS |
|
|
|
|
|
# Configure the Streamlit page: browser-tab title and centered layout.
st.set_page_config(
    page_title='Cetvel π',
    layout='centered',
)
|
|
|
|
|
@st.cache_data
def cache_results(path):
    """Load benchmark results from *path* and build a scores DataFrame.

    Each result entry contributes one row holding the model's metadata plus
    one column per evaluated dataset. Scores are scaled to percentages for
    every metric except WER. One aggregate column per dataset group (the
    row-wise mean over its member datasets) is appended. Cached by Streamlit
    so repeated reruns of the app skip the file I/O.
    """
    rows = []
    for entry in read_results(path):
        model_info = entry['model']
        row = {
            'model': model_info['model'],
            'num_parameters': model_info['num_parameters'],
            'url': get_model_url(model_info),
            'architecture': model_info['architecture'],
            'type': model_info['type'],
            'precision': model_info['dtype'],
        }
        for result in entry['results']:
            metric = TASK_METRIC_DICT.get(result['task'])
            score = result.get(metric)
            # Metrics other than WER are stored as fractions; report percentages.
            if score is not None and metric != Metrics.WER:
                score = 100 * score
            row[result['name']] = score
        rows.append(row)
    df = pd.DataFrame(rows)
    # Append one mean column per dataset group.
    for group, metadata in DATASET_GROUPS.items():
        df[group] = df[metadata['datasets']].mean(axis=1)
    return df
|
|
|
|
|
@st.cache_data
def cache_datasets(path):
    """Load dataset metadata from the JSON file at *path*.

    Returns the parsed mapping of dataset key -> metadata dict, with each
    metadata dict augmented with a 'dataset' field echoing its own key so
    entries are self-describing. Cached by Streamlit across app reruns.
    """
    path = preprocess_path(path)
    # JSON is UTF-8 by specification; don't depend on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        datasets = json.load(f)
    # Iterate items() directly instead of keys() + re-lookup.
    for key, metadata in datasets.items():
        metadata['dataset'] = key
    return datasets
|
|
|
|
|
def create_column_configs(items):
    """Build a Streamlit NumberColumn config for every entry of *items*.

    *items* maps a column key to its metadata dict; the column label falls
    back to the key itself when the metadata has no 'name'. All columns share
    a minimum of 0 and a two-decimal format.
    """
    configs = {}
    for key, metadata in items.items():
        label = metadata.get('name', key)
        configs[key] = st.column_config.NumberColumn(
            label,
            help=metadata['description'],
            min_value=0,
            format="%2.2f",
        )
    return configs
|
|
|
|
|
def insert_average(df, keys):
    """Return a copy of *df* with an 'average' column and rows ranked by it.

    The average is the row-wise mean over the columns of *df* that appear in
    *keys*. It is placed right after the first column, the index is shifted to
    start at 1, and rows are sorted best-first.
    """
    result = df.copy(deep=True)
    score_columns = [column for column in result.columns if column in keys]
    # Place the new column directly at position 1 instead of append-then-move.
    result.insert(1, 'average', result[score_columns].mean(axis=1))
    result.index = result.index + 1
    return result.sort_values(by=['average'], ascending=False)
|
|
|
|
|
# Streamlit column configurations for the model-specification columns of the
# results table ('model', 'url', '#params', 'type') plus the computed 'average'.
MODEL_SPEC_CONFIGS = {
    'model': st.column_config.TextColumn(
        'Model',
        help='Large Language Model (LLM) used for the experiments.',
        max_chars=120,
    ),
    'url': st.column_config.LinkColumn(
        'URL',
        help='Model URL.',
        display_text='Click',
    ),
    'num_parameters': st.column_config.TextColumn(
        '#params',
        help='Approximate number of parameters.',
    ),
    'type': st.column_config.TextColumn(
        'Type',
        help='Model type based on training objective.',
    ),
    'average': st.column_config.NumberColumn(
        'Avg.',
        help='Average across task or dataset performances.',
        format="%2.2f",
    )
}
|
|
|
|
|
def filter_visible_model_specs():
    """Render a multiselect of model-spec columns; return the chosen DataFrame column names."""
    # Label -> (DataFrame column, display order).
    specs = {
        'URL': ('url', 1),
        '#params': ('num_parameters', 2),
        'Architecture': ('architecture', 3),
        'Type': ('type', 4),
        'Precision': ('precision', 5),
    }
    ordered_labels = sorted(specs, key=lambda label: specs[label][1])
    visible_specs = st.multiselect(
        'Select model specs to be shown in the table.',
        options=ordered_labels,
    )
    return [specs[label][0] for label in visible_specs]
|
|
|
|
|
def filter_by_model_spec():
    """Placeholder for filtering rows by model specification (not implemented)."""
    pass
|
|
|
|
|
def filter_visible_datasets(datasets):
    """Render grouping/task-filter widgets and return the dataset columns to show.

    With 'Group Datasets' selected, the group names themselves are returned.
    With 'Show All Datasets', the task filter narrows the individual dataset
    keys: by generative flag for the understanding/generation options, or by
    the matching DATASET_GROUPS entry for the task-specific options.
    """
    grouping_column, task_column = st.columns(2)
    with grouping_column:
        dataset_grouping = st.selectbox(
            'Dataset Grouping',
            [
                'Group Datasets',
                'Show All Datasets',
            ],
        )

    with task_column:
        filter_by_task = st.selectbox(
            'Filter by Task',
            [
                'All',
                'Understanding Tasks',
                'Generation Tasks',
                'Multiple Choice',
                'Extractive Question Answering',
                'Natural Language Inference',
                'Text Classification',
                'Summarization',
            ],
            # The task filter only applies when individual datasets are shown.
            disabled=dataset_grouping == "Group Datasets",
        )

    if dataset_grouping == 'Group Datasets':
        return list(DATASET_GROUPS.keys())
    elif dataset_grouping == 'Show All Datasets':
        if filter_by_task == 'All':
            return list(datasets.keys())
        if filter_by_task == 'Understanding Tasks':
            return [name for name, meta in datasets.items() if not meta['generative']]
        if filter_by_task == 'Generation Tasks':
            return [name for name, meta in datasets.items() if meta['generative']]
        # Remaining options map one-to-one onto a dataset group.
        group_keys = {
            'Multiple Choice': 'MCQA',
            'Extractive Question Answering': 'QA',
            'Natural Language Inference': 'NLI',
            'Text Classification': 'TC',
            'Summarization': 'SUM',
        }
        group_key = group_keys.get(filter_by_task)
        if group_key is not None:
            return DATASET_GROUPS[group_key]['datasets']
|
|
|
|
|
def introduction():
    """Render the page header: title, subtitle, and the benchmark description."""
    st.title(':blue[Cetvel :straight_ruler:]')
    st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
    st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool, specifically includes tasks/datasets for benchmarking Turkish Large Language Models (LLMs). Cetvel includes a variety of tasks curated to assess different aspects of model performance in the Turkish language. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, you can check the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')
|
|
|
|
|
def main():
    """Compose the leaderboard page: intro, filters, results table, footer logo."""
    introduction()

    results_df = cache_results('./results/zero-shot')
    datasets = cache_datasets('./data/datasets.json')

    # Merge model-spec, group, and per-dataset column configs into one mapping.
    dataset_configs = create_column_configs(datasets)
    group_configs = create_column_configs(DATASET_GROUPS)
    column_configs = MODEL_SPEC_CONFIGS | group_configs | dataset_configs

    visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
    visible_model_columns = filter_visible_model_specs()
    results_df = insert_average(results_df, visible_data_columns)

    st.dataframe(
        results_df,
        use_container_width=True,
        hide_index=True,
        column_config=column_configs,
        column_order=['model', 'average',] + visible_model_columns + visible_data_columns,
    )
    st.image('./assets/kuis-ai-logo.png', width=240)
|
|
|
|
|
main() |