"""Script to produce radial plots."""

from collections import defaultdict
from functools import partial
import json
import random

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pydantic import BaseModel
import requests


class Task(BaseModel):
    """Class to hold task information."""

    # Human-readable task name; also the sole input to the hash.
    name: str
    # Key under which a record's score is looked up for this task (main() tries
    # both "test_<metric>" and "<metric>").
    metric: str

    def __hash__(self):
        """Hash on the name only, so tasks can be used as dict keys/set members."""
        return hash(self.name)


class Language(BaseModel):
    """Class to hold language information."""

    # Short language code (e.g. "da"); the sole input to the hash.
    code: str
    # Display name (e.g. "Danish"); used as the key of ALL_LANGUAGES.
    name: str

    def __hash__(self):
        """Hash on the language code only, so languages can live in sets."""
        return hash(self.code)


class Dataset(BaseModel):
    """Class to hold dataset information."""

    # Dataset identifier; compared against the "dataset" field of the records.
    name: str
    # Language the dataset belongs to.
    language: Language
    # Task the dataset evaluates; determines which metric is read.
    task: Task

    def __hash__(self):
        """Hash on the dataset name only."""
        return hash(self.name)


TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")
GRAMMAR = Task(name="grammar", metric="mcc")
QUESTION_ANSWERING = Task(name="question answering", metric="em")
SUMMARISATION = Task(name="summarisation", metric="bertscore")
KNOWLEDGE = Task(name="knowledge", metric="mcc")
REASONING = Task(name="reasoning", metric="mcc")

# Collect every Task defined above. The globals are snapshotted with list()
# before iterating: a comprehension that iterates the live module namespace can
# raise "RuntimeError: dictionary changed size during iteration" on Python 3.12+
# when the module runs under tracing (coverage, debuggers).
ALL_TASKS = [obj for obj in list(globals().values()) if isinstance(obj, Task)]


DANISH = Language(code="da", name="Danish")
NORWEGIAN = Language(code="no", name="Norwegian")
SWEDISH = Language(code="sv", name="Swedish")
ICELANDIC = Language(code="is", name="Icelandic")
FAROESE = Language(code="fo", name="Faroese")
GERMAN = Language(code="de", name="German")
DUTCH = Language(code="nl", name="Dutch")
ENGLISH = Language(code="en", name="English")

# Mapping from language display name to Language, for every Language defined
# above. Snapshotted for the same reason as ALL_TASKS.
ALL_LANGUAGES = {
    obj.name: obj
    for obj in list(globals().values())
    if isinstance(obj, Language)
}


# All known datasets, grouped by task.
DATASETS = [
    Dataset(name="swerec", language=SWEDISH, task=TEXT_CLASSIFICATION),
    Dataset(name="angry-tweets", language=DANISH, task=TEXT_CLASSIFICATION),
    Dataset(name="norec", language=NORWEGIAN, task=TEXT_CLASSIFICATION),
    Dataset(name="sb10k", language=GERMAN, task=TEXT_CLASSIFICATION),
    Dataset(name="dutch-social", language=DUTCH, task=TEXT_CLASSIFICATION),
    Dataset(name="sst5", language=ENGLISH, task=TEXT_CLASSIFICATION),
    Dataset(name="suc3", language=SWEDISH, task=INFORMATION_EXTRACTION),
    Dataset(name="dansk", language=DANISH, task=INFORMATION_EXTRACTION),
    Dataset(name="norne-nb", language=NORWEGIAN, task=INFORMATION_EXTRACTION),
    Dataset(name="norne-nn", language=NORWEGIAN, task=INFORMATION_EXTRACTION),
    Dataset(name="mim-gold-ner", language=ICELANDIC, task=INFORMATION_EXTRACTION),
    Dataset(name="fone", language=FAROESE, task=INFORMATION_EXTRACTION),
    Dataset(name="germeval", language=GERMAN, task=INFORMATION_EXTRACTION),
    Dataset(name="conll-nl", language=DUTCH, task=INFORMATION_EXTRACTION),
    Dataset(name="conll-en", language=ENGLISH, task=INFORMATION_EXTRACTION),
    Dataset(name="scala-sv", language=SWEDISH, task=GRAMMAR),
    Dataset(name="scala-da", language=DANISH, task=GRAMMAR),
    Dataset(name="scala-nb", language=NORWEGIAN, task=GRAMMAR),
    Dataset(name="scala-nn", language=NORWEGIAN, task=GRAMMAR),
    Dataset(name="scala-is", language=ICELANDIC, task=GRAMMAR),
    Dataset(name="scala-fo", language=FAROESE, task=GRAMMAR),
    Dataset(name="scala-de", language=GERMAN, task=GRAMMAR),
    Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR),
    Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR),
    Dataset(name="scandiqa-da", language=DANISH, task=QUESTION_ANSWERING),
    Dataset(name="norquad", language=NORWEGIAN, task=QUESTION_ANSWERING),
    Dataset(name="scandiqa-sv", language=SWEDISH, task=QUESTION_ANSWERING),
    Dataset(name="nqii", language=ICELANDIC, task=QUESTION_ANSWERING),
    Dataset(name="germanquad", language=GERMAN, task=QUESTION_ANSWERING),
    Dataset(name="squad", language=ENGLISH, task=QUESTION_ANSWERING),
    Dataset(name="squad-nl", language=DUTCH, task=QUESTION_ANSWERING),
    Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION),
    Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION),
    Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION),
    Dataset(name="no-sammendrag", language=NORWEGIAN, task=SUMMARISATION),
    Dataset(name="wiki-lingua-nl", language=DUTCH, task=SUMMARISATION),
    Dataset(name="swedn", language=SWEDISH, task=SUMMARISATION),
    Dataset(name="cnn-dailymail", language=ENGLISH, task=SUMMARISATION),
    Dataset(name="mmlu-da", language=DANISH, task=KNOWLEDGE),
    Dataset(name="mmlu-no", language=NORWEGIAN, task=KNOWLEDGE),
    Dataset(name="mmlu-sv", language=SWEDISH, task=KNOWLEDGE),
    Dataset(name="mmlu-is", language=ICELANDIC, task=KNOWLEDGE),
    Dataset(name="mmlu-de", language=GERMAN, task=KNOWLEDGE),
    Dataset(name="mmlu-nl", language=DUTCH, task=KNOWLEDGE),
    Dataset(name="mmlu", language=ENGLISH, task=KNOWLEDGE),
    Dataset(name="arc-da", language=DANISH, task=KNOWLEDGE),
    Dataset(name="arc-no", language=NORWEGIAN, task=KNOWLEDGE),
    Dataset(name="arc-sv", language=SWEDISH, task=KNOWLEDGE),
    Dataset(name="arc-is", language=ICELANDIC, task=KNOWLEDGE),
    Dataset(name="arc-de", language=GERMAN, task=KNOWLEDGE),
    Dataset(name="arc-nl", language=DUTCH, task=KNOWLEDGE),
    Dataset(name="arc", language=ENGLISH, task=KNOWLEDGE),
    Dataset(name="hellaswag-da", language=DANISH, task=REASONING),
    Dataset(name="hellaswag-no", language=NORWEGIAN, task=REASONING),
    Dataset(name="hellaswag-sv", language=SWEDISH, task=REASONING),
    Dataset(name="hellaswag-is", language=ICELANDIC, task=REASONING),
    Dataset(name="hellaswag-de", language=GERMAN, task=REASONING),
    Dataset(name="hellaswag-nl", language=DUTCH, task=REASONING),
    Dataset(name="hellaswag", language=ENGLISH, task=REASONING),
]


def main() -> None:
    """Produce a radial plot."""

    # Download all the newest records
    response = requests.get("https://scandeval.com/scandeval_benchmark_results.jsonl")
    response.raise_for_status()
    records = [
        json.loads(dct_str)
        for dct_str in response.text.split("\n")
        if dct_str.strip("\n")
    ]

    # Build a dictionary of languages -> results-dataframes, whose indices are the
    # models and columns are the tasks.
results_dfs = dict() for language in {dataset.language for dataset in DATASETS}: possible_dataset_names = { dataset.name for dataset in DATASETS if dataset.language == language } data_dict = defaultdict(dict) for record in records: model_name = record["model"] dataset_name = record["dataset"] if dataset_name in possible_dataset_names: dataset = next( dataset for dataset in DATASETS if dataset.name == dataset_name ) results_dict = record['results']['total'] score = results_dict.get( f"test_{dataset.task.metric}", results_dict.get(dataset.task.metric) ) if dataset.task in data_dict[model_name]: data_dict[model_name][dataset.task].append(score) else: data_dict[model_name][dataset.task] = [score] results_df = pd.DataFrame(data_dict).T.map( lambda list_or_nan: np.mean(list_or_nan) if list_or_nan == list_or_nan else list_or_nan ).dropna() if any(task not in results_df.columns for task in ALL_TASKS): results_dfs[language] = pd.DataFrame() else: results_dfs[language] = results_df all_languages: list[str | int | float | tuple[str, str | int | float]] | None = [ language.name for language in ALL_LANGUAGES.values() ] all_models: list[str | int | float | tuple[str, str | int | float]] | None = list({ model_id for df in results_dfs.values() for model_id in df.index }) with gr.Blocks(theme=gr.themes.Monochrome()) as demo: gr.Markdown("# Radial Plot Generator") gr.Markdown( "This demo allows you to generate a radial plot comparing the performance " "of different language models on different tasks. It is based on the " "generative results from the [ScandEval benchmark](https://scandeval.com)." 
) with gr.Row(): with gr.Column(): language_names_dropdown = gr.Dropdown( choices=all_languages, multiselect=True, label="Languages", value=["Danish"], interactive=True, ) model_ids_dropdown = gr.Dropdown( choices=all_models, multiselect=True, label="Models", value=["gpt-3.5-turbo-0613", "mistralai/Mistral-7B-v0.1"], interactive=True, ) use_win_ratio_checkbox = gr.Checkbox( label="Compare models with win ratios (as opposed to raw scores)", value=True, interactive=True, ) gr.Markdown( "