"""Gradio front end for the multilingual LLM evaluation leaderboard.

Declares a ``gr.Blocks`` app with filter controls (model search, model type,
languages, tasks, zero-/few-shot), two result tables and a plot tab. All data
handling is delegated to the project-local ``core`` module; this file only
lays out the widgets and wires their events to ``core`` callbacks.

NOTE(review): the indentation of this file had been lost; the container
nesting below was reconstructed from the ``with`` statements and the
``scale=`` hints. Confirm it against the deployed layout.
NOTE(review): the "π" characters in the search placeholder and in the two
tab titles look like mis-decoded emoji. They are kept as found because the
original glyphs cannot be recovered from this file alone.
"""

import gradio as gr

import core as core
from style import CSS, T_SYMBOLS, TITLE


def update_bar() -> gr.CheckboxGroup:
    """Return a language selector with every language ticked.

    Used as the handler of the "Select all languages" button; the returned
    component is applied to the existing ``langs_bar``. ``core.languages_list``
    is read at call time.
    """
    return gr.CheckboxGroup(
        choices=core.languages_list,
        value=core.languages_list,
        label="Select languages to average over",
        elem_id="column-select",
        interactive=True,
    )


demo = gr.Blocks(css=CSS)

with demo:
    gr.HTML(TITLE)
    # Adjacent literals instead of a backslash continuation inside the
    # string, so no source indentation leaks into the text and the two
    # sentences stay separated by exactly one space.
    gr.Markdown(
        "This is a (WIP) collection of multilingual evaluation results "
        "obtained using our fork of the LM-evaluation-harness "
        "(https://github.com/OpenGPTX/lm-evaluation-harness), based on "
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
        "Note that currently, not all benchmarks are available in all "
        "languages, results are averaged over those languages under the "
        "selected ones for which the benchmark is available.",
        elem_classes="markdown-text",
    )

    # ---- Filter controls -------------------------------------------------
    with gr.Column():
        with gr.Row():
            with gr.Column():
                # Row 1: free-text model search and model-type filter.
                with gr.Row():
                    search_bar = gr.Textbox(
                        label="Search models",
                        placeholder=" π Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )

                    model_types = gr.CheckboxGroup(
                        label="Select model type",
                        choices=[
                            (
                                f"Pretrained {T_SYMBOLS['pretrained']}",
                                T_SYMBOLS["pretrained"],
                            ),
                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                        ],
                        value=list(T_SYMBOLS.values()),
                    )

                # Row 2: language selection plus select/deselect-all buttons.
                with gr.Row():
                    langs_bar = gr.CheckboxGroup(
                        choices=core.languages_list,
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        clear = gr.ClearButton(
                            langs_bar,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select = gr.Button(
                            value="Select all languages", size="sm", scale=1
                        )
                        select.click(update_bar, inputs=[], outputs=langs_bar)

                # Row 3: task selection and zero-/few-shot switch.
                with gr.Row():
                    acc_task_group_names = core.task_groups_with_task_type("accuracy")
                    shown_tasks = gr.CheckboxGroup(
                        choices=acc_task_group_names,
                        value=acc_task_group_names,
                        label="Select tasks to show",
                        elem_id="column-select",
                        interactive=True,
                        scale=50,
                    )
                    fewshot = gr.Radio(
                        choices=[("0-Shot", False), ("Few-shot", True)],
                        value=True,
                        label="Select evaluation type",
                        interactive=True,
                        scale=29,
                    )
                    # core.fix_zeroshot receives the current task selection
                    # and the few-shot flag; its result replaces the task
                    # selection.
                    fewshot.change(
                        core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks
                    )
                    clear = gr.ClearButton(
                        shown_tasks, value="Deselect all tasks", size="sm", scale=21
                    )

    # ---- Result tabs -----------------------------------------------------
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # This title literal was split across two source lines (a syntax
        # error); rejoined to match the translation tab's title below.
        with gr.TabItem(
            "π LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
        ) as acc:
            leaderboard_table = gr.Dataframe()
        with gr.TabItem(
            "π LLM translation benchmark",
            elem_id="llm-benchmark-tab-table-misc",
            id=1,
        ) as misc:
            leaderboard_table_misc = gr.Dataframe()
        with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
            leaderboard_plot = gr.Plot(elem_id="plot")

        # Selecting a table tab calls core.update_tab_tasks with that tab's
        # index and the current few-shot value; the returned values are
        # written to the task selector and the evaluation-type radio.
        acc.select(
            lambda x: core.update_tab_tasks(0, x),
            inputs=fewshot,
            outputs=[shown_tasks, fewshot],
        )
        misc.select(
            lambda x: core.update_tab_tasks(1, x),
            inputs=fewshot,
            outputs=[shown_tasks, fewshot],
        )

    # ---- Refresh wiring --------------------------------------------------
    # Every refresh callback receives the same filter state, in this order.
    filter_inputs = [shown_tasks, search_bar, langs_bar, model_types, fewshot]
    # (callback, output component) pairs that are refreshed together.
    refresh_targets = [
        (core.update_df, leaderboard_table),
        (core.update_df, leaderboard_table_misc),
        (core.update_plot, leaderboard_plot),
    ]
    # (component, event name) pairs that trigger a refresh of all outputs.
    refresh_triggers = [
        (search_bar, "submit"),
        (langs_bar, "change"),
        (shown_tasks, "change"),
        (fewshot, "change"),
        (model_types, "change"),
    ]

    for component, event_name in refresh_triggers:
        register = getattr(component, event_name)
        for callback, output in refresh_targets:
            register(callback, filter_inputs, output)

    # Populate both tables and the plot once when the page loads.
    for callback, output in refresh_targets:
        demo.load(fn=callback, inputs=filter_inputs, outputs=output)

demo.launch()