|
"""Script to produce radial plots.""" |
|
|
|
from functools import partial |
|
import plotly.graph_objects as go |
|
import json |
|
import numpy as np |
|
from collections import defaultdict |
|
import pandas as pd |
|
from pydantic import BaseModel |
|
import gradio as gr |
|
import requests |
|
import random |
|
import logging |
|
import datetime as dt |
|
import scipy.stats as stats |
|
import itertools as it |
|
|
|
|
|
# Log line layout: timestamp, level in brackets, logger name in angle brackets,
# then the message itself.
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s"

# Configure the root logger to emit INFO and above using the layout defined above.
logging.basicConfig(level=logging.INFO, format=fmt)

# Named logger used for all log messages emitted by this script.
logger = logging.getLogger("radial_plot_generator")
|
|
|
|
|
# Markdown rendered at the very top of the demo page, above the tabs.
INTRO_MARKDOWN = """
# Radial Plot Generator

This demo allows you to generate a radial plot comparing the performance of different
language models on different tasks. It is based on the generative results from the
[ScandEval benchmark](https://scandeval.com).
"""
|
|
|
|
|
# Markdown rendered in the "About" tab of the demo: describes the benchmark, how the
# rank scores shown in the plot are computed, the task categories, and how to cite.
ABOUT_MARKDOWN = """
## About the ScandEval Benchmark

The [ScandEval benchmark](https://scandeval.com) is used to compare pretrained language
models on tasks in Danish, Swedish, Norwegian Bokmål, Norwegian Nynorsk, Icelandic,
Faroese, German, Dutch and English. The benchmark supports both encoder models (such as
BERT) and generative models (such as GPT), and leaderboards for both kinds [are
available](https://scandeval.com).

The generative models are evaluated using in-context learning with few-shot prompts.
The few-shot examples are sampled randomly from the training split, and we benchmark
the models 10 times with bootstrapped test sets and different few-shot examples in each
iteration. This allows us to better measure the uncertainty of the results. We use the
uncertainty in the radial plot when we compute the rank scores for the models. Namely,
we compute the rank score by firstly computing the rank of the model on each task,
where two models are considered to have the same rank if there is not a statistically
significant difference between their scores (one-tailed t-test with p < 0.05). We next
apply a logarithmic transformation to the ranks, to downplay the importance of the
poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
range [0, 1], resulting in the best performing models having rank scores close to 1 and
the worst performing models having rank scores close to 0.

## The Benchmark Datasets

The ScandEval generative benchmark currently covers the languages Danish, Swedish,
Norwegian, Icelandic, German, Dutch and English. For each language, the benchmark
consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are
the following:

### Text Classification
Given a piece of text, classify it into a number of classes. For this task we extract
the first token of the possible labels, and choose the label whose first token has the
highest probability. All datasets in this category are currently trinary sentiment
classification datasets. We use the Matthews Correlation Coefficient (MCC) as the
evaluation metric.

### Information Extraction
Given a piece of text, extract a number of entities from the text. As the model needs
to extract multiple entities, we use [structured
generation](https://github.com/noamgat/lm-format-enforcer) to make the model generate a
JSON dictionary with keys being the entity categories and values being lists of the
identified entities. All datasets in this task are named entity recognition datasets.
We use the micro-averaged F1 score as the evaluation metric, where we ignore the
Miscellaneous category.

### Grammar
Given a piece of text, determine whether it is grammatically correct or not. All
datasets in this task are built from the dependency treebanks of the languages, where
words are removed or swapped, in a way that makes the sentence ungrammatical. We use
the Matthews Correlation Coefficient (MCC) as the evaluation metric.

### Question Answering
Given a question and a piece of text, extract the answer to the question from the text.
All datasets in this task are extractive question answering datasets. We use the exact
match (EM) score as the evaluation metric.

### Summarisation
Given a piece of text, generate a summary of the text. All the datasets come from
either news articles or WikiHow articles. We use the BERTScore metric as the evaluation
metric, where the encoder model used is
[microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base).

### Knowledge
Given a trivia-style question with multiple choice answers, choose the correct answer.
As with text classification, we use the probabilities of the answer letter (a, b, c or
d) to choose the answer. The datasets in this task are machine translated versions of
the [MMLU](https://doi.org/10.48550/arXiv.2009.03300) and
[ARC](https://allenai.org/data/arc) datasets. We use the Matthews Correlation
Coefficient (MCC) as the evaluation metric.

### Reasoning
Given a scenario and multiple possible endings, choose the correct ending. As with text
classification, we use the probabilities of the answer letter (a, b, c or d) to choose
the answer. The datasets in this task are machine translated versions of the
[HellaSwag](https://rowanzellers.com/hellaswag/) dataset. We use the Matthews
Correlation Coefficient (MCC) as the evaluation metric.


## Citation

If you use the ScandEval benchmark in your work, please cite [the
paper](https://aclanthology.org/2023.nodalida-1.20):

```
@inproceedings{nielsen2023scandeval,
  title={ScandEval: A Benchmark for Scandinavian Natural Language Processing},
  author={Nielsen, Dan},
  booktitle={Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
  pages={185--201},
  year={2023}
}
```
"""
|
|
|
|
|
# The event handlers re-download the benchmark results when more than this many
# minutes have passed since the last fetch.
UPDATE_FREQUENCY_MINUTES = 5

# Starting value for the minimum distance (sum of absolute differences of the R, G and
# B channels) required between the colours of any two models. The colour assignment
# lowers this by one for every attempt that fails.
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
|
|
|
|
|
class Task(BaseModel):
    """Class to hold task information.

    Attributes:
        name:
            The name of the task. It is used as the axis label in the radial plot and
            is the only field that enters the hash.
        metric:
            The key of the metric used to look up the scores of the task in the raw
            benchmark records (either as `test_<metric>` or `<metric>`).
    """

    name: str
    metric: str

    def __hash__(self) -> int:
        """Hash on the task name, making tasks usable as dictionary keys and labels."""
        return hash(self.name)
|
|
|
|
|
class Language(BaseModel):
    """Class to hold language information.

    Attributes:
        code:
            The short code of the language (e.g. "da"). It is the only field that
            enters the hash.
        name:
            The display name of the language (e.g. "Danish"), which is what the
            language dropdown shows and what `ALL_LANGUAGES` is keyed by.
    """

    code: str
    name: str

    def __hash__(self) -> int:
        """Hash on the language code, making languages usable as dictionary keys."""
        return hash(self.code)
|
|
|
|
|
class Dataset(BaseModel):
    """Class to hold dataset information.

    Attributes:
        name:
            The name of the dataset, matched against the `dataset` field of the
            fetched benchmark records. It is the only field that enters the hash.
        language:
            The language of the dataset.
        task:
            The task that the dataset belongs to.
    """

    name: str
    language: Language
    task: Task

    def __hash__(self) -> int:
        """Hash on the dataset name, making datasets usable in sets and as keys."""
        return hash(self.name)
|
|
|
|
|
# The tasks of the generative benchmark, each paired with the key of its metric in the
# raw benchmark records.
SUMMARISATION = Task(name="summarisation", metric="bertscore")
KNOWLEDGE = Task(name="knowledge", metric="mcc")
REASONING = Task(name="reasoning", metric="mcc")
GRAMMAR = Task(name="grammar", metric="mcc")
QUESTION_ANSWERING = Task(name="question answering", metric="em")
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")

# All tasks, in definition order. This order determines the order of the axes in the
# radial plot.
ALL_TASKS = [
    SUMMARISATION,
    KNOWLEDGE,
    REASONING,
    GRAMMAR,
    QUESTION_ANSWERING,
    TEXT_CLASSIFICATION,
    INFORMATION_EXTRACTION,
]
|
|
|
# The languages covered by the generative benchmark.
DANISH = Language(code="da", name="Danish")
NORWEGIAN = Language(code="no", name="Norwegian")
SWEDISH = Language(code="sv", name="Swedish")
ICELANDIC = Language(code="is", name="Icelandic")
GERMAN = Language(code="de", name="German")
DUTCH = Language(code="nl", name="Dutch")
ENGLISH = Language(code="en", name="English")

# Lookup from display name to language, in definition order.
ALL_LANGUAGES = {
    language.name: language
    for language in (DANISH, NORWEGIAN, SWEDISH, ICELANDIC, GERMAN, DUTCH, ENGLISH)
}
|
|
|
# All benchmark datasets, grouped by task. Within each task the datasets are listed as
# `dataset name -> language`; the resulting list keeps exactly this order.
DATASETS = [
    Dataset(name=dataset_name, language=dataset_language, task=dataset_task)
    for dataset_task, languages_by_dataset_name in {
        TEXT_CLASSIFICATION: {
            "swerec": SWEDISH,
            "angry-tweets": DANISH,
            "norec": NORWEGIAN,
            "sb10k": GERMAN,
            "dutch-social": DUTCH,
            "sst5": ENGLISH,
        },
        INFORMATION_EXTRACTION: {
            "suc3": SWEDISH,
            "dansk": DANISH,
            "norne-nb": NORWEGIAN,
            "norne-nn": NORWEGIAN,
            "mim-gold-ner": ICELANDIC,
            "germeval": GERMAN,
            "conll-nl": DUTCH,
            "conll-en": ENGLISH,
        },
        GRAMMAR: {
            "scala-sv": SWEDISH,
            "scala-da": DANISH,
            "scala-nb": NORWEGIAN,
            "scala-nn": NORWEGIAN,
            "scala-is": ICELANDIC,
            "scala-de": GERMAN,
            "scala-nl": DUTCH,
            "scala-en": ENGLISH,
        },
        QUESTION_ANSWERING: {
            "scandiqa-da": DANISH,
            "norquad": NORWEGIAN,
            "scandiqa-sv": SWEDISH,
            "nqii": ICELANDIC,
            "germanquad": GERMAN,
            "squad": ENGLISH,
            "squad-nl": DUTCH,
        },
        SUMMARISATION: {
            "nordjylland-news": DANISH,
            "mlsum": GERMAN,
            "rrn": ICELANDIC,
            "no-sammendrag": NORWEGIAN,
            "wiki-lingua-nl": DUTCH,
            "swedn": SWEDISH,
            "cnn-dailymail": ENGLISH,
        },
        KNOWLEDGE: {
            "danish-citizen-tests": DANISH,
            "danske-talemaader": DANISH,
            "mmlu-no": NORWEGIAN,
            "mmlu-sv": SWEDISH,
            "mmlu-is": ICELANDIC,
            "mmlu-de": GERMAN,
            "mmlu-nl": DUTCH,
            "mmlu": ENGLISH,
        },
        REASONING: {
            "hellaswag-da": DANISH,
            "hellaswag-no": NORWEGIAN,
            "hellaswag-sv": SWEDISH,
            "hellaswag-is": ICELANDIC,
            "hellaswag-de": GERMAN,
            "hellaswag-nl": DUTCH,
            "hellaswag": ENGLISH,
        },
    }.items()
    for dataset_name, dataset_language in languages_by_dataset_name.items()
]
|
|
|
|
|
def main() -> None:
    """Fetch the benchmark results, build the Gradio demo and launch it.

    The function initialises the module-level state that the event handlers rely on
    (`last_fetch`, `seed` and, through `update_colour_mapping`, `colour_mapping`),
    lays out the user interface, wires up the event handlers and finally starts the
    Gradio server.
    """
    global last_fetch, seed

    results_dfs = fetch_results()
    last_fetch = dt.datetime.now()

    all_languages = sorted(
        (language.name for language in ALL_LANGUAGES.values()), key=str.lower
    )
    danish_models = sorted(set(results_dfs[DANISH].index), key=str.lower)

    # Only preselect models that are actually present in the fetched results. A
    # preselected model without results has no entry in the colour mapping, which
    # would make the initial plot fail with a `KeyError`.
    default_model_ids = [
        model_id
        for model_id in ["gpt-4-0613", "mistralai/Mistral-7B-v0.1"]
        if model_id in danish_models
    ]

    # `update_colour_mapping` increments the seed before using it, and stores the
    # resulting colours in the global `colour_mapping`.
    seed = 4242
    update_colour_mapping(results_dfs=results_dfs)

    with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
        gr.Markdown(INTRO_MARKDOWN)

        with gr.Tab(label="Build a Radial Plot"):
            with gr.Column():
                with gr.Row():
                    language_names_dropdown = gr.Dropdown(
                        choices=all_languages,
                        multiselect=True,
                        label="Languages",
                        value=["Danish"],
                        interactive=True,
                        scale=2,
                    )
                    model_ids_dropdown = gr.Dropdown(
                        choices=danish_models,
                        multiselect=True,
                        label="Models",
                        value=default_model_ids,
                        interactive=True,
                        scale=2,
                    )
                with gr.Row():
                    use_rank_score_checkbox = gr.Checkbox(
                        label="Compare models with rank scores (as opposed to raw "
                        "scores)",
                        value=True,
                        interactive=True,
                        scale=1,
                    )
                    show_scale_checkbox = gr.Checkbox(
                        label="Show the scale on the plot (always 0-100)",
                        value=False,
                        interactive=True,
                        scale=1,
                    )
                    plot_width_slider = gr.Slider(
                        label="Plot width",
                        minimum=600,
                        maximum=1000,
                        step=10,
                        value=800,
                        interactive=True,
                        scale=1,
                    )
                    plot_height_slider = gr.Slider(
                        label="Plot height",
                        minimum=300,
                        maximum=700,
                        step=10,
                        value=500,
                        interactive=True,
                        scale=1,
                    )
                    update_colours_button = gr.Button(
                        value="Update colours",
                        interactive=True,
                        scale=1,
                    )
                with gr.Row():
                    # The initial plot is rendered from the default widget values.
                    plot = gr.Plot(
                        value=produce_radial_plot(
                            model_ids_dropdown.value,
                            language_names=language_names_dropdown.value,
                            use_rank_score=use_rank_score_checkbox.value,
                            show_scale=show_scale_checkbox.value,
                            plot_width=plot_width_slider.value,
                            plot_height=plot_height_slider.value,
                            results_dfs=results_dfs,
                        ),
                    )
        with gr.Tab(label="About"):
            gr.Markdown(ABOUT_MARKDOWN)

        gr.Markdown(
            "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
            "Alexandra Institute</a>.</center>"
        )

        # Changing the languages changes which models are available.
        language_names_dropdown.change(
            fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
            inputs=[language_names_dropdown, model_ids_dropdown],
            outputs=model_ids_dropdown,
        )

        # Any change to one of the plot inputs redraws the plot.
        update_plot_kwargs = dict(
            fn=partial(produce_radial_plot, results_dfs=results_dfs),
            inputs=[
                model_ids_dropdown,
                language_names_dropdown,
                use_rank_score_checkbox,
                show_scale_checkbox,
                plot_width_slider,
                plot_height_slider,
            ],
            outputs=plot,
        )
        for plot_input in (
            language_names_dropdown,
            model_ids_dropdown,
            use_rank_score_checkbox,
            show_scale_checkbox,
            plot_width_slider,
            plot_height_slider,
        ):
            plot_input.change(**update_plot_kwargs)

        # Pick new colours first, then redraw the plot with them.
        update_colours_button.click(
            fn=partial(update_colour_mapping, results_dfs=results_dfs),
        ).then(**update_plot_kwargs)

    demo.launch()
|
|
|
|
|
def update_model_ids_dropdown(
    language_names: list[str],
    model_ids: list[str],
    results_dfs: dict[Language, pd.DataFrame] | None,
) -> dict:
    """Refresh the model ids dropdown after the selected languages have changed.

    Only models with results for every selected language are offered as choices. The
    currently selected models that are still valid are kept; if none of them are
    valid then up to two of the valid models are selected at random.

    Args:
        language_names:
            The names of the languages to include in the plot.
        model_ids:
            The ids of the models to include in the plot.
        results_dfs:
            The results dataframes for each language.

    Returns:
        The Gradio update to the model ids dropdown.
    """
    global last_fetch

    # Fetch fresh results if the ones we were handed are too old.
    time_since_last_fetch = dt.datetime.now() - last_fetch
    if time_since_last_fetch.total_seconds() / 60 > UPDATE_FREQUENCY_MINUTES:
        results_dfs = fetch_results()
        last_fetch = dt.datetime.now()

    if results_dfs is None:
        logger.info("No results fetched yet. Resetting model ids dropdown.")
        return gr.update(choices=[], value=[])
    if len(language_names) == 0:
        logger.info("No languages selected. Resetting model ids dropdown.")
        return gr.update(choices=[], value=[])

    selected_dfs = [
        df for language, df in results_dfs.items() if language.name in language_names
    ]

    # Restrict the dataframes to the tasks that all the selected languages share.
    shared_tasks = [
        task for task in ALL_TASKS if all(task in df.columns for df in selected_dfs)
    ]
    shared_task_dfs = [df[shared_tasks] for df in selected_dfs]

    candidate_models: set[str] = {
        str(model_id) for df in shared_task_dfs for model_id in df.index
    }

    # A model is only available if it has results for every selected language.
    available_models: list[str] = sorted(
        (
            model_id
            for model_id in candidate_models
            if all(model_id in df.index for df in shared_task_dfs)
        ),
        key=str.lower,
    )
    if not available_models:
        logger.info(
            "No valid models for the selected languages. Resetting model ids dropdown."
        )
        return gr.update(choices=[], value=[])

    selection: list[str] = [
        model_id for model_id in model_ids if model_id in available_models
    ]
    if not selection:
        # Nothing of the previous selection survived, so pick (up to) two at random.
        selection = random.sample(
            population=available_models, k=min(2, len(available_models))
        )

    logger.info(
        f"Updated model ids dropdown with {len(available_models):,} valid models for "
        f"the selected languages, with {selection} selected."
    )

    return gr.update(choices=available_models, value=selection)
|
|
|
|
|
def produce_radial_plot(
    model_ids: list[str],
    language_names: list[str],
    use_rank_score: bool,
    show_scale: bool,
    plot_width: int,
    plot_height: int,
    results_dfs: dict[Language, pd.DataFrame] | None,
) -> go.Figure:
    """Produce a radial plot as a plotly figure.

    Each selected model gets one trace with one value per task, where the value is
    the mean over the selected languages of either the rank score or the raw score of
    the model, on a 0-100 scale. Only tasks present for all selected languages are
    shown.

    Args:
        model_ids:
            The ids of the models to include in the plot.
        language_names:
            The names of the languages to include in the plot.
        use_rank_score:
            Whether to use rank scores (as opposed to raw scores).
        show_scale:
            Whether to show the scale on the plot.
        plot_width:
            The width of the plot.
        plot_height:
            The height of the plot.
        results_dfs:
            The results dataframes for each language.

    Returns:
        A plotly figure. The figure is empty if there are no results, no languages or
        no models to plot.
    """
    global last_fetch

    # Fetch fresh results if the ones we were handed are too old.
    minutes_since_last_fetch = (dt.datetime.now() - last_fetch).total_seconds() / 60
    if minutes_since_last_fetch > UPDATE_FREQUENCY_MINUTES:
        results_dfs = fetch_results()
        last_fetch = dt.datetime.now()

    # Truthiness checks also cover a cleared selection being passed as `None`.
    if results_dfs is None or not language_names or not model_ids:
        if results_dfs is None:
            logger.info("No results fetched yet. Resetting plot.")
        elif not language_names:
            logger.info("No languages selected. Resetting plot.")
        else:
            logger.info("No models selected. Resetting plot.")
        return go.Figure()

    logger.info(
        f"Producing radial plot for models {model_ids!r} on languages "
        f"{language_names!r}..."
    )

    languages = [ALL_LANGUAGES[language_name] for language_name in language_names]
    results_dfs_filtered = {
        language: df
        for language, df in results_dfs.items()
        if language.name in language_names
    }

    # Only plot the tasks that all the selected languages have results for.
    tasks = [
        task
        for task in ALL_TASKS
        if all(task in df.columns for df in results_dfs_filtered.values())
    ]

    # The rank scores require a t-test per model, task and language, so they are only
    # computed when they are going to be plotted.
    all_rank_scores: dict[tuple[Task, Language], dict[str, float]] = dict()
    if use_rank_score:
        logger.info("Computing rank scores...")
        for task in tasks:
            for language in languages:
                all_rank_scores[task, language] = _compute_rank_scores(
                    model_scores=results_dfs_filtered[language][task].dropna()
                )
        logger.info("Successfully computed rank scores.")

    # Build one row per model with one value per task, each on a 0-100 scale.
    results: list[list[float]] = list()
    for model_id in model_ids:
        result_list: list[float] = list()
        for task in tasks:
            language_scores: list[float] = list()
            for language in languages:
                df = results_dfs_filtered[language]
                if model_id not in df.index:
                    continue
                if use_rank_score:
                    score = 100 * all_rank_scores[task, language][model_id]
                else:
                    score = float(np.asarray(df.loc[model_id][task], dtype=float).mean())
                    # Scores given as fractions are converted to percentages. The
                    # absolute value ensures that negative scores that are already
                    # percentages (e.g. an MCC of -5) are not scaled a second time.
                    if abs(score) < 1:
                        score *= 100
                language_scores.append(score)

            # A model without results for any selected language has no value here.
            result_list.append(
                float(np.mean(language_scores)) if language_scores else float("nan")
            )
        results.append(result_list)

    # Order the models by how many (model, task) comparisons they win, best first, so
    # that the strongest model is drawn first and the weaker ones end up on top of it.
    result_matrix = np.array(results, dtype=float)
    num_models_beaten = (
        (result_matrix[:, None, :] > result_matrix[None, :, :])
        .sum(axis=(1, 2))
        .astype(float)
    )
    sorted_idxs = num_models_beaten.argsort()[::-1]
    sorted_model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
    sorted_results = result_matrix[sorted_idxs].tolist()

    fig = go.Figure()
    task_names = [task.name for task in tasks]
    for model_id, result_list in zip(sorted_model_ids, sorted_results):
        r, g, b = colour_mapping[model_id]
        fig.add_trace(go.Scatterpolar(
            r=result_list,
            theta=task_names,
            name=model_id,
            fill='toself',
            fillcolor=f'rgba({r}, {g}, {b}, 0.6)',
            line=dict(color=f'rgb({r}, {g}, {b})'),
        ))

    # Join the language names as "A, B and C".
    selected_language_names = [language.name for language in languages]
    languages_str = selected_language_names[-1]
    if len(selected_language_names) > 1:
        languages_str = (
            ", ".join(selected_language_names[:-1]) + " and " + languages_str
        )

    score_kind = "Rank Score" if use_rank_score else "Raw Score"
    title = f'{score_kind} on {languages_str} Language Tasks'

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=show_scale, range=[0, 100])),
        showlegend=True,
        title=title,
        width=plot_width,
        height=plot_height,
    )

    logger.info("Successfully produced radial plot.")

    return fig


def _compute_rank_scores(model_scores: pd.Series) -> dict[str, float]:
    """Compute the rank score of every model for a single task and language.

    The models are sorted by mean score. A model only gets a worse rank than the
    models before it if its scores are significantly worse (one-tailed paired t-test
    with p < 0.05) than the scores of the best model of the current rank. The ranks
    are then log-transformed, normalised and inverted, so that the best models get
    the rank score 1 and the worst get 0.

    Args:
        model_scores:
            A series indexed by model id, whose values are the lists of scores of
            the models.

    Returns:
        A mapping from model id to a rank score in the range [0, 1]. It is empty if
        there are no models.
    """
    model_ids_sorted: list[str] = (
        model_scores.map(np.mean).sort_values(ascending=False).index.tolist()
    )
    if not model_ids_sorted:
        return dict()

    # The model with the best mean score starts the first rank.
    best_scores = model_scores.loc[model_ids_sorted[0]]
    rank = 1
    ranks = [rank]
    for model_id in model_ids_sorted[1:]:
        scores = model_scores.loc[model_id]
        pvalue = stats.ttest_rel(
            a=best_scores, b=scores, alternative="greater"
        ).pvalue
        if pvalue < 0.05:
            rank += 1
            best_scores = scores
        ranks.append(rank)

    log_ranks = np.log(ranks)
    max_log_rank = log_ranks.max()
    if max_log_rank == 0:
        # All models share the best rank, so normalising would divide zero by zero.
        rank_scores = np.ones_like(log_ranks)
    else:
        rank_scores = 1 - (log_ranks / max_log_rank)

    return dict(zip(model_ids_sorted, rank_scores.tolist()))
|
|
|
|
|
def fetch_results() -> dict[Language, pd.DataFrame]:
    """Fetch the results from the ScandEval benchmark.

    Returns:
        A dictionary of languages -> results-dataframes, whose indices are the
        models and columns are the tasks. Each cell holds the list of test scores of
        the model on the task. Models lacking results for any task of the language
        are left out. There is an entry for every language appearing in `DATASETS`.

    Raises:
        requests.RequestException:
            If the results could not be downloaded, including when the request
            timed out.
    """
    logger.info("Fetching results from ScandEval benchmark...")

    # Without a timeout an unresponsive server would block the caller indefinitely.
    timeout_seconds = 30
    response = requests.get(
        "https://www.scandeval.com/scandeval_benchmark_results.jsonl",
        timeout=timeout_seconds,
    )
    response.raise_for_status()

    # One JSON record per line. Lines that only contain whitespace (such as a stray
    # carriage return) are skipped rather than being handed to the JSON parser.
    records = [
        json.loads(line) for line in response.text.split("\n") if line.strip()
    ]

    # If a dataset name were to occur more than once then the first occurrence wins.
    dataset_by_name: dict[str, Dataset] = dict()
    for dataset in DATASETS:
        dataset_by_name.setdefault(dataset.name, dataset)

    # language -> model -> task -> list with one list of scores per dataset
    scores_by_language: dict[Language, defaultdict] = dict()
    for dataset in DATASETS:
        scores_by_language.setdefault(dataset.language, defaultdict(dict))

    for record in records:
        model_name = record["model"]

        # For the GPT-3.5 and GPT-4 models, only the records that are flagged with
        # `validation_split` are used.
        if "gpt-3.5" in model_name or "gpt-4" in model_name:
            if not record.get("validation_split", False):
                continue

        dataset = dataset_by_name.get(record["dataset"])
        if dataset is None:
            continue

        # The metric is stored either with or without the `test_` prefix.
        metric = dataset.task.metric
        scores = [
            test_score_dict.get(f"test_{metric}", test_score_dict.get(metric))
            for test_score_dict in record["results"]["raw"]["test"]
        ]
        task_scores = scores_by_language[dataset.language][model_name]
        task_scores.setdefault(dataset.task, []).append(scores)

    # NOTE(review): when a task has several score lists for a model (multiple
    # datasets, or repeated records), only the first one is used - confirm that this
    # is intended rather than combining them.
    results_dfs = {
        language: pd.DataFrame(data_dict).T.dropna().map(
            lambda score_lists: score_lists[0]
        )
        for language, data_dict in scores_by_language.items()
    }

    logger.info("Successfully fetched results from ScandEval benchmark.")

    return results_dfs
|
|
|
|
|
def update_colour_mapping(results_dfs: dict[Language, pd.DataFrame]) -> None:
    """Assign a colour to every model and store it in the global `colour_mapping`.

    The global `seed` is incremented on every call, so that each call yields a new
    set of colours. The colours are required to be at least
    `MIN_COLOUR_DISTANCE_BETWEEN_MODELS` apart from each other; whenever no such
    mapping is found within the retry budget, the required distance is lowered by
    one and a new attempt is made from scratch.

    Args:
        results_dfs:
            The results dataframes for each language.
    """
    global colour_mapping
    global seed
    seed += 1

    gr.Info("Updating colour mapping...")

    # Sorted, as the iteration order of a set of strings differs between processes.
    all_models = sorted(
        {model_id for df in results_dfs.values() for model_id in df.index}, key=str
    )

    for attempt in it.count():
        min_colour_distance = MIN_COLOUR_DISTANCE_BETWEEN_MODELS - attempt
        new_colour_mapping = _sample_colour_mapping(
            model_ids=all_models,
            min_colour_distance=min_colour_distance,
            base_seed=attempt + seed,
        )
        if new_colour_mapping is None:
            continue

        # The global mapping is only replaced once a complete mapping was found.
        colour_mapping = new_colour_mapping
        logger.info(
            f"Successfully found a colour mapping with min colour distance "
            f"{min_colour_distance}."
        )
        break


def _sample_colour_mapping(
    model_ids: list[str], min_colour_distance: int, base_seed: int
) -> dict[str, tuple[int, int, int]] | None:
    """Try to draw sufficiently distinct, not too bright colours for the models.

    Args:
        model_ids:
            The ids of the models to assign colours to.
        min_colour_distance:
            The minimum distance (sum of the absolute differences of the R, G and B
            channels) required between the colours of any two models.
        base_seed:
            Seed which is combined with a checksum of the model id to seed the colour
            generation of each model.

    Returns:
        A mapping from model id to an RGB triplet, or None if the retry budget of 10
        draws per model on average was used up before all models got a colour.
    """
    import zlib

    retries_left = 10 * len(model_ids)
    mapping: dict[str, tuple[int, int, int]] = dict()
    for model_id in model_ids:
        # The built-in `hash` of a string differs between processes, so a checksum
        # is used instead. A dedicated generator leaves the global random state of
        # the `random` module untouched.
        model_checksum = zlib.crc32(str(model_id).encode("utf-8"))
        rng = random.Random(model_checksum + base_seed)

        while True:
            if retries_left <= 0:
                return None
            retries_left -= 1

            colour = (rng.randint(0, 255), rng.randint(0, 255), rng.randint(0, 255))
            too_bright = min(colour) > 200
            similar_to_other_model = any(
                sum(abs(channel - other_channel)
                    for channel, other_channel in zip(colour, other_colour))
                < min_colour_distance
                for other_colour in mapping.values()
            )
            if not too_bright and not similar_to_other_model:
                break

        mapping[model_id] = colour

    return mapping
|
|
|
|
|
# Only build and launch the demo when the file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|
|