Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
KlaudiaTH
commited on
Commit
·
2b62c4c
1
Parent(s):
6ee7d57
Release version of leaderboard implementation
Browse files- .gitattributes +36 -0
- .github/workflows/check_large_files-action.yml +16 -0
- .github/workflows/push_to_hfspace-action.yml +21 -0
- .gitignore +2 -0
- README.md +49 -1
- app.py +160 -0
- core.py +235 -0
- pyproject.toml +2 -0
- requirements.txt +20 -0
- style.py +16 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
small_merged_data.xlsx filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/check_large_files-action.yml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Check file size
|
2 |
+
on: # or directly `on: [push]` to run the action on every push on any branch
|
3 |
+
pull_request:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- name: Check large files
|
14 |
+
uses: ActionsDesk/lfs-warning@v2.0
|
15 |
+
with:
|
16 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/push_to_hfspace-action.yml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
with:
|
15 |
+
token: ${{ secrets.GITHUB_TOKEN }}
|
16 |
+
fetch-depth: 0
|
17 |
+
lfs: true
|
18 |
+
- name: Push to hub
|
19 |
+
env:
|
20 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
21 |
+
run: git push https://gptxuser:$HF_TOKEN@huggingface.co/spaces/openGPT-X/leaderboard main
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.vscode/
|
2 |
+
__pycache__/
|
README.md
CHANGED
@@ -1 +1,49 @@
|
|
1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# New data model
|
2 |
+
|
3 |
+
The new model is constructed by taking individual json files in data/new_eval, combining them together into
|
4 |
+
a simple format, and from the combined df, we create individual files for each model.
|
5 |
+
|
6 |
+
For the new eval runs which have to be appended, we first analyze the model associated with the json file
|
7 |
+
produced from eval harness, select the corresponding model file to append, find the unique rows (unique configuration
|
8 |
+
of model name, language, task group and few shot) in the json file, append if unique rows are not 0.
|
9 |
+
|
10 |
+
|
11 |
+
---
|
12 |
+
title: Leaderboard
|
13 |
+
emoji: 👁
|
14 |
+
colorFrom: blue
|
15 |
+
colorTo: blue
|
16 |
+
sdk: gradio
|
17 |
+
sdk_version: 4.19.2
|
18 |
+
app_file: app.py
|
19 |
+
pinned: false
|
20 |
+
license: unknown
|
21 |
+
---
|
22 |
+
|
23 |
+
# Introduction
|
24 |
+
|
25 |
+
This is the OpenGPT-X multilingual leaderboard source code repository.
|
26 |
+
The leaderboard aims to provide an overview of LLM performance over various languages.
|
27 |
+
The basic task set consists of MMLU, ARC, HellaSwag, GSM8k, TruthfulQA and belebele.
|
28 |
+
To make the results comparable to the Open LLM leaderboard (https://huggingface.co/open-llm-leaderboard) we selected the former five tasks based on our internal machine translations of the English base tasks, in addition to the high-quality multilingual benchmark belebele by Meta.
|
29 |
+
|
30 |
+
# Usage
|
31 |
+
|
32 |
+
The currently hosted leaderboard can be found at https://huggingface.co/spaces/openGPT-X/leaderboard.
|
33 |
+
In order to extend its functionality please create a PR.
|
34 |
+
|
35 |
+
# Adding new tasks
|
36 |
+
|
37 |
+
In order to add new evaluation tasks proceed as follows:
|
38 |
+
|
39 |
+
1. Add task information to `TASK_INFO` in `src/data.py`. It should be a dict mapping the task display name to the metric to be shown, as well as a dict containing mappings from two-letter language codes to the corresponding lm-eval-harness task selection string. See existing task information for reference.
|
40 |
+
2. Add evaluation results as detailed below.
|
41 |
+
|
42 |
+
# Adding new models
|
43 |
+
|
44 |
+
It is possible to change the display name of a particular model.
|
45 |
+
Simply add an entry to `_MODEL_NAMES` in `src/data.py`.
|
46 |
+
|
47 |
+
# Adding evaluation results
|
48 |
+
|
49 |
+
Copy the `.json`-output generated by the lm-eval-harness into `data`.
|
app.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
import core as core
|
4 |
+
from style import CSS, T_SYMBOLS, TITLE
|
5 |
+
|
6 |
+
demo = gr.Blocks(css=CSS)
|
7 |
+
with demo:
|
8 |
+
gr.HTML(TITLE)
|
9 |
+
gr.Markdown(
|
10 |
+
"This is a (WIP) collection of multilingual evaluation results obtained using our fork of the LM-evaluation-harness (https://github.com/OpenGPTX/lm-evaluation-harness), based on https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard.\
|
11 |
+
Note that currently, not all benchmarks are available in all languages, results are averaged over those languages under the selected ones for which the benchmark is available.",
|
12 |
+
elem_classes="markdown-text",
|
13 |
+
)
|
14 |
+
|
15 |
+
with gr.Column():
|
16 |
+
with gr.Row():
|
17 |
+
with gr.Column():
|
18 |
+
with gr.Row():
|
19 |
+
search_bar = gr.Textbox(
|
20 |
+
label="Search models",
|
21 |
+
placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
|
22 |
+
show_label=True,
|
23 |
+
elem_id="search-bar",
|
24 |
+
)
|
25 |
+
|
26 |
+
model_types = gr.CheckboxGroup(
|
27 |
+
label="Select model type",
|
28 |
+
choices=[
|
29 |
+
(
|
30 |
+
f"Pretrained {T_SYMBOLS['pretrained']}",
|
31 |
+
T_SYMBOLS["pretrained"],
|
32 |
+
),
|
33 |
+
(f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
|
34 |
+
],
|
35 |
+
value=list(T_SYMBOLS.values()),
|
36 |
+
)
|
37 |
+
with gr.Row():
|
38 |
+
langs_bar = gr.CheckboxGroup(
|
39 |
+
choices=core.languages_list,
|
40 |
+
value=core.languages_list,
|
41 |
+
label="Select languages to average over",
|
42 |
+
elem_id="column-select",
|
43 |
+
interactive=True,
|
44 |
+
scale=6,
|
45 |
+
)
|
46 |
+
with gr.Column(scale=1):
|
47 |
+
clear = gr.ClearButton(
|
48 |
+
langs_bar,
|
49 |
+
value="Deselect all languages",
|
50 |
+
size="sm",
|
51 |
+
scale=1,
|
52 |
+
)
|
53 |
+
select = gr.Button(
|
54 |
+
value="Select all languages", size="sm", scale=1
|
55 |
+
)
|
56 |
+
|
57 |
+
def update_bar():
    """Build a language checkbox group with every language selected.

    Used as the click handler of the "Select all languages" button: the
    returned component replaces the current state of ``langs_bar``.
    """
    all_languages = core.languages_list
    return gr.CheckboxGroup(
        choices=all_languages,
        value=all_languages,
        label="Select languages to average over",
        elem_id="column-select",
        interactive=True,
    )
|
66 |
+
|
67 |
+
select.click(update_bar, inputs=[], outputs=langs_bar)
|
68 |
+
|
69 |
+
with gr.Row():
|
70 |
+
acc_task_group_names = core.task_groups_with_task_type("accuracy")
|
71 |
+
shown_tasks = gr.CheckboxGroup(
|
72 |
+
choices=acc_task_group_names,
|
73 |
+
value=acc_task_group_names,
|
74 |
+
label="Select tasks to show",
|
75 |
+
elem_id="column-select",
|
76 |
+
interactive=True,
|
77 |
+
scale=50,
|
78 |
+
)
|
79 |
+
fewshot = gr.Radio(
|
80 |
+
choices=[("0-Shot", False), ("Few-shot", True)],
|
81 |
+
value=True,
|
82 |
+
label="Select evaluation type",
|
83 |
+
interactive=True,
|
84 |
+
scale=29,
|
85 |
+
)
|
86 |
+
fewshot.change(
|
87 |
+
core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks
|
88 |
+
)
|
89 |
+
clear = gr.ClearButton(
|
90 |
+
shown_tasks, value="Deselect all tasks", size="sm", scale=21
|
91 |
+
)
|
92 |
+
|
93 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
94 |
+
with gr.TabItem(
|
95 |
+
"🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
|
96 |
+
) as acc:
|
97 |
+
leaderboard_table = gr.Dataframe()
|
98 |
+
with gr.TabItem(
|
99 |
+
"🌐 LLM translation benchmark",
|
100 |
+
elem_id="llm-benchmark-tab-table-misc",
|
101 |
+
id=1,
|
102 |
+
) as misc:
|
103 |
+
leaderboard_table_misc = gr.Dataframe()
|
104 |
+
with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
|
105 |
+
leaderboard_plot = gr.Plot(elem_id="plot")
|
106 |
+
acc.select(
|
107 |
+
lambda x: core.update_tab_tasks(0, x),
|
108 |
+
inputs=fewshot,
|
109 |
+
outputs=[shown_tasks, fewshot],
|
110 |
+
)
|
111 |
+
misc.select(
|
112 |
+
lambda x: core.update_tab_tasks(1, x),
|
113 |
+
inputs=fewshot,
|
114 |
+
outputs=[shown_tasks, fewshot],
|
115 |
+
)
|
116 |
+
for comp, fn in [
|
117 |
+
(search_bar, "submit"),
|
118 |
+
(langs_bar, "change"),
|
119 |
+
(shown_tasks, "change"),
|
120 |
+
(fewshot, "change"),
|
121 |
+
(model_types, "change"),
|
122 |
+
]:
|
123 |
+
getattr(comp, fn)(
|
124 |
+
core.update_df,
|
125 |
+
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
126 |
+
leaderboard_table,
|
127 |
+
)
|
128 |
+
getattr(comp, fn)(
|
129 |
+
core.update_df,
|
130 |
+
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
131 |
+
leaderboard_table_misc,
|
132 |
+
)
|
133 |
+
getattr(comp, fn)(
|
134 |
+
core.update_plot,
|
135 |
+
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
136 |
+
leaderboard_plot,
|
137 |
+
)
|
138 |
+
|
139 |
+
gr.Blocks.load(
|
140 |
+
block=demo,
|
141 |
+
fn=core.update_df,
|
142 |
+
inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
143 |
+
outputs=leaderboard_table,
|
144 |
+
)
|
145 |
+
|
146 |
+
gr.Blocks.load(
|
147 |
+
block=demo,
|
148 |
+
fn=core.update_df,
|
149 |
+
inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
150 |
+
outputs=leaderboard_table_misc,
|
151 |
+
)
|
152 |
+
|
153 |
+
gr.Blocks.load(
|
154 |
+
block=demo,
|
155 |
+
fn=core.update_plot,
|
156 |
+
inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
157 |
+
outputs=leaderboard_plot,
|
158 |
+
)
|
159 |
+
|
160 |
+
demo.launch()
|
core.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import itertools
|
2 |
+
import os
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
import plotly.express as px
|
8 |
+
from datasets import load_dataset
|
9 |
+
|
10 |
+
import style
|
11 |
+
|
12 |
+
TAB_STATE = 0 # FIXME
|
13 |
+
GSM8K_TASK_GROUP_NAME = "GSM8K" # FIXME
|
14 |
+
|
15 |
+
|
16 |
+
def init():
    """Load the leaderboard dataset and populate the module-level state.

    Reads the dataset repository, configuration and split names from the
    environment, downloads the dataset, derives several lookup tables from
    the long-format frame and finally replaces ``hidden_df`` with a wide
    (pivoted) frame that has one row per model.

    All results are stored in module globals; nothing is returned.
    """
    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict

    # Dataset location comes from environment variables. os.getenv returns
    # None for unset variables.
    # NOTE(review): there is no check for missing variables here - confirm
    # that load_dataset fails with a usable message in that case.
    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

    dataset = load_dataset(repo_id, config_name, split=split_name)
    hidden_df = dataset.to_pandas()

    # The lookup tables below must be derived BEFORE the pivot further down,
    # because the pivot only keeps Model_Name, Task_Group, Few_Shot, Language
    # and Value.
    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
    # Task_Group -> Task_Type
    task_group_type_df = hidden_df[["Task_Group", "Task_Type"]].drop_duplicates()
    task_group_type_dict = task_group_type_df.set_index("Task_Group")["Task_Type"].to_dict()
    # Task_Group -> Number_Shots, taken only from rows flagged as few-shot
    task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
    task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
    # Language codes are upper-cased for display
    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
    # Model_Name -> Model_Type
    model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
    model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()

    # Wide layout: one row per model, columns keyed by
    # (Task_Group, Few_Shot, Language). dropna=False keeps all-NaN columns.
    hidden_df = hidden_df.pivot_table(
        columns=["Task_Group", "Few_Shot", "Language"],
        index=["Model_Name"],
        values="Value",
        dropna=False,
    ).reset_index(inplace=False)

    # Add the model-type symbol (looked up in style.T_SYMBOLS) as a "Type" column.
    hidden_df["Type"] = hidden_df["Model_Name"].apply(lambda x: style.T_SYMBOLS[model_type_dict[x]])
|
43 |
+
|
44 |
+
|
45 |
+
def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
    """Return ``df`` with the meta columns first and the task columns sorted.

    The result always starts with ``Type``, ``Model_Name`` and ``Average``,
    followed by the task columns in alphabetical order.

    In few-shot mode each task column that has an entry in
    ``task_groups_shots_dict`` is renamed to ``"<task> (<n>-shot)"``. Task
    columns without such an entry are not part of the result, because only
    the renamed columns are kept.

    Args:
        df: Frame containing ``Type``, ``Model_Name``, ``Average`` and one
            column per task.
        fewshot: Whether the frame holds few-shot results.

    Returns:
        A new, re-ordered frame. The input frame is left untouched.
    """
    task_cols = get_task_columns(df)
    if fewshot:
        renamer = {col: f"{col} ({task_groups_shots_dict[col]}-shot)" for col in task_cols if col in task_groups_shots_dict}
        # Rename on a new object: the caller usually hands in a filtered
        # slice, and renaming that in place would mutate the caller's data.
        df = df.rename(columns=renamer)
        task_cols = list(renamer.values())
    return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
|
52 |
+
|
53 |
+
|
54 |
+
def get_task_columns(df: pd.DataFrame) -> list:
    """Return the column labels of ``df`` that hold task results.

    Every column except the meta columns ``Model_Name``, ``Average`` and
    ``Type`` counts as a task column. The original column order is kept and
    ``df`` itself is not modified. Meta columns that are absent from ``df``
    are simply ignored.
    """
    meta_columns = {"Model_Name", "Average", "Type"}
    return [col for col in df.columns if col not in meta_columns]
|
60 |
+
|
61 |
+
|
62 |
+
def get_models(df: pd.DataFrame) -> pd.DataFrame:
    """Return the distinct model names found in ``df["Model_Name"]``.

    NOTE(review): the return annotation says DataFrame, but the value is
    whatever ``Series.unique()`` yields (an array of names).
    """
    model_names = df["Model_Name"]
    return model_names.unique()
|
64 |
+
|
65 |
+
|
66 |
+
def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``Type`` value is one of ``model_types``."""
    type_matches = df["Type"].isin(model_types)
    return df.loc[type_matches]
|
69 |
+
|
70 |
+
|
71 |
+
def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Keep only rows whose model name matches the search query.

    ``query`` may contain several search terms separated by ``;``. A row is
    kept when its ``Model_Name`` contains at least one of the terms,
    ignoring case. Terms are matched literally: surrounding whitespace is
    stripped, regex metacharacters are escaped and empty terms (for example
    from a trailing ``;``) are ignored. A query without any usable term
    leaves ``df`` unfiltered.

    Args:
        df: Frame with a ``Model_Name`` column.
        query: Raw text from the search bar.

    Returns:
        The matching rows of ``df``.
    """
    import re  # function-scope import: `re` is not imported at file level

    terms = [term.strip() for term in query.split(";")]
    # Escape each term so names like "c++" or "model (v2)" neither raise
    # re.error nor get interpreted as regex syntax.
    pattern = "|".join(re.escape(term) for term in terms if term)
    if not pattern:
        return df
    return df[df["Model_Name"].str.contains(pattern, case=False, regex=True)]
|
75 |
+
|
76 |
+
|
77 |
+
def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list):
    """Average each task over the selected languages.

    The (task, language) column tuples of ``df`` are first flattened in
    place to ``"<task>_<lang>"`` labels (empty tuple parts are dropped, so
    the meta columns become plain ``Model_Name`` / ``Type``). For every task
    a new column holding the mean over the requested languages is added; if
    at least one requested language has no column for a task, or any value
    is missing, that task is NaN. ``Average`` is the mean of the task
    columns.

    Returns the columns ``Type``, ``Model_Name``, ``Average`` followed by
    ``tasks``.
    """
    wanted_langs = [lang.lower() for lang in langs]
    df.columns = ["_".join(filter(None, label)) for label in df.columns]
    available = set(df.columns)
    for task in tasks:
        lang_cols = [f"{task}_{lang}" for lang in wanted_langs]
        if not set(lang_cols).issubset(available):
            # at least one selected language is unavailable for this task
            df.loc[:, task] = np.nan
            continue
        df.loc[:, task] = df[lang_cols].mean(axis=1, skipna=False)
    df.loc[:, "Average"] = df[tasks].mean(axis=1)
    return df[["Type", "Model_Name", "Average"] + tasks]
|
94 |
+
|
95 |
+
|
96 |
+
def select_shots(df: pd.DataFrame, fewshot: bool = False):
    """Keep only the result columns for the requested shot setting.

    Columns of ``df`` are tuples whose second entry is the few-shot flag.
    Result columns with a matching flag are kept, the ``Model_Name`` and
    ``Type`` columns are appended after them, and the few-shot level is then
    removed from the column index.
    """
    selected = [label for label in df.columns if label[1] == fewshot]
    # Model name and type icon go after the result columns
    selected.extend([("Model_Name", "", ""), ("Type", "", "")])
    subset = df[selected]
    return subset.droplevel(level=1, axis="columns")
|
102 |
+
|
103 |
+
|
104 |
+
def update_df(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
    format: bool = True,
) -> pd.DataFrame:
    """Build the leaderboard table for the current UI selection.

    Starting from the module-level ``hidden_df``, the table is restricted to
    the chosen shot setting, aggregated over ``langs`` for every task in
    ``tasks``, filtered by the search query and the model types, and finally
    brought into display column order.

    When ``format`` is true the result is wrapped in a pandas Styler that
    renders numbers with two decimal places; otherwise the plain DataFrame
    is returned.
    """
    # restrict to the selected shot setting
    shots_df = select_shots(hidden_df, fewshot)

    # one aggregated column per task
    aggregated = aggregate_langs(shots_df, tasks, langs)

    # apply search bar and model type filters
    matching = filter_type(search_model(aggregated, model_query), model_types)

    ordered = sort_cols(matching, fewshot)
    if not format:
        return ordered
    return ordered.style.format(precision=2, decimal=".")
|
130 |
+
|
131 |
+
|
132 |
+
def make_plot(df: pd.DataFrame):
    """Create a plotly figure from a transposed leaderboard table.

    ``df`` is expected to have a ``Model_Name`` row (``update_plot`` passes
    the transposed table). That row becomes the column header, the remaining
    index is turned into a ``task`` column, and the result is drawn as a
    line chart when there are more than two columns, otherwise as a bar
    chart of the last column. The x axis is treated as categorical.
    """
    # Use the model names as column labels (this relabels the passed frame)
    df.columns = df.loc["Model_Name"]
    chart_df = df.drop("Model_Name").reset_index(names="task")
    if len(chart_df.columns) <= 2:
        fig = px.bar(data_frame=chart_df, x="task", y=chart_df.columns[-1], width=1200)
    else:
        fig = px.line(data_frame=chart_df, x="task", y=chart_df.columns, markers=True, width=1200)
    fig.update_xaxes(type="category")
    return fig
|
142 |
+
|
143 |
+
|
144 |
+
def update_plot(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
):
    """Return the plot for the current UI selection.

    Builds the unformatted leaderboard table with ``update_df``, transposes
    it and hands it to ``make_plot``.
    """
    table = update_df(tasks, model_query, langs, model_types, fewshot, False)
    return make_plot(table.transpose())
|
154 |
+
|
155 |
+
|
156 |
+
def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
    """Rebuild the task checkbox group after the shot setting changed.

    The available choices are the task groups of the currently active tab
    (``TAB_STATE``). In zero-shot mode the GSM8K task group is removed from
    the choices. In few-shot mode on the accuracy tab (``TAB_STATE == 0``)
    GSM8K is selected again, but only if it is an available choice and not
    already selected, so the value never holds duplicates or entries
    outside ``choices``.

    Args:
        tasks: Currently selected task groups.
        fewshot: Whether few-shot evaluation is selected.

    Returns:
        A ``gr.CheckboxGroup`` carrying the updated choices and selection.
    """
    selected_task_type = get_selected_task_type(TAB_STATE)
    choices = task_groups_with_task_type(selected_task_type)
    if not fewshot and GSM8K_TASK_GROUP_NAME in choices:
        choices.remove(GSM8K_TASK_GROUP_NAME)

    # keep only selections that are still valid choices
    value = [v for v in tasks if v in choices]
    if fewshot and TAB_STATE == 0 and GSM8K_TASK_GROUP_NAME in choices and GSM8K_TASK_GROUP_NAME not in value:
        value.append(GSM8K_TASK_GROUP_NAME)

    return gr.CheckboxGroup(
        choices=choices,
        value=value,
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )
|
180 |
+
|
181 |
+
|
182 |
+
def update_tab_tasks(id: int, fewshot: bool = False):
    """Refresh the task and shot selectors after a tab switch.

    Stores ``id`` in the module-level ``TAB_STATE`` and returns
    ``[shown_tasks, fewshot]``: a checkbox group with all task groups of the
    new tab selected (without GSM8K when the incoming ``fewshot`` is false),
    and a radio that defaults to few-shot on the accuracy tab (``id == 0``)
    or is fixed to 0-shot and non-interactive on the translation tab
    (``id == 1``). For any other ``id`` the incoming ``fewshot`` value is
    passed back unchanged.
    """
    # remember which tab is active
    global TAB_STATE
    TAB_STATE = id

    choices = task_groups_with_task_type(get_selected_task_type(TAB_STATE))
    if not fewshot and GSM8K_TASK_GROUP_NAME in choices:
        choices.remove(GSM8K_TASK_GROUP_NAME)

    shown_tasks = gr.CheckboxGroup(
        choices=choices,
        value=list(choices),
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )

    shot_options = [("0-Shot", False), ("Few-shot", True)]
    if id == 0:
        # accuracy tab: default to few-shot
        fewshot = gr.Radio(choices=shot_options, value=True, label="Select evaluation type", interactive=True, scale=29)
    elif id == 1:
        # translation tab: force 0-shot and lock the selector
        fewshot = gr.Radio(choices=shot_options, value=False, label="Select evaluation type", interactive=False, scale=29)
    return [shown_tasks, fewshot]
|
221 |
+
|
222 |
+
|
223 |
+
def get_selected_task_type(task_type_id):
    """Map a tab id to its task type name (0 -> "accuracy", 1 -> "misc").

    Raises KeyError for any other id.
    """
    return {0: "accuracy", 1: "misc"}[task_type_id]
|
227 |
+
|
228 |
+
|
229 |
+
def task_groups_with_task_type(selected_task_type):
    """Return a new list of the task groups whose type equals ``selected_task_type``.

    The lookup uses the module-level ``task_group_type_dict`` and keeps its
    iteration order.
    """
    matching_groups = []
    for group_name, group_type in task_group_type_dict.items():
        if group_type == selected_task_type:
            matching_groups.append(group_name)
    return matching_groups
|
233 |
+
|
234 |
+
|
235 |
+
init()
|
pyproject.toml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[tool.black]
|
2 |
+
line-length = 250
|
requirements.txt
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler==3.10.1
|
2 |
+
black==23.11.0
|
3 |
+
click==8.1.3
|
4 |
+
datasets==2.14.5
|
5 |
+
gradio==4.19.2
|
6 |
+
gradio_client==0.10.1
|
7 |
+
huggingface-hub>=0.18.0
|
8 |
+
markdown-it-py==2.2.0
|
9 |
+
MarkupSafe==2.1.2
|
10 |
+
matplotlib==3.7.1
|
11 |
+
numpy==1.24.2
|
12 |
+
pandas==2.0.0
|
13 |
+
plotly==5.14.1
|
14 |
+
python-dateutil==2.8.2
|
15 |
+
requests==2.28.2
|
16 |
+
semantic-version==2.10.0
|
17 |
+
tqdm==4.65.0
|
18 |
+
transformers==4.35.2
|
19 |
+
tokenizers>=0.15.0
|
20 |
+
openpyxl>=3.1.2,<4.0.0
|
style.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TITLE = """<h1 align="center" id="space-title">OpenGPT-X Multilingual LLM Leaderboard</h1>"""
|
2 |
+
CSS = """
|
3 |
+
#plot {
|
4 |
+
height: 512px;
|
5 |
+
display: flex;
|
6 |
+
justify-content: center;
|
7 |
+
align-items: center;
|
8 |
+
}
|
9 |
+
.modebar{
|
10 |
+
display: none !important;
|
11 |
+
}
|
12 |
+
"""
|
13 |
+
T_SYMBOLS = {
|
14 |
+
"pretrained": "🟢",
|
15 |
+
"chat": "💬"
|
16 |
+
}
|