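# Gradio app for the multilingual LLM evaluation leaderboard.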
import gradio as gr

import core
from style import CSS, T_SYMBOLS, TITLE
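
# Build the leaderboard UI: header, filter controls, and tabbed result views.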
demo = gr.Blocks(css=CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a work-in-progress collection of multilingual evaluation results, "
        "obtained with our fork of the LM-evaluation-harness "
        "(https://github.com/OpenGPTX/lm-evaluation-harness) and based on "
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
        "Note that not all benchmarks are currently available in all languages; "
        "results are averaged over those of the selected languages for which "
        "the benchmark is available.",
        elem_classes="markdown-text",
    )
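
    # Filter controls: model search and type, languages, tasks, and evaluation type.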
    with gr.Column():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    search_bar = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                    model_types = gr.CheckboxGroup(
                        label="Select model type",
                        choices=[
                            (
                                f"Pretrained {T_SYMBOLS['pretrained']}",
                                T_SYMBOLS["pretrained"],
                            ),
                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                        ],
                        value=list(T_SYMBOLS.values()),
                    )
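
        # Language filter with buttons to deselect or re-select all languages.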
        with gr.Row():
            langs_bar = gr.CheckboxGroup(
                choices=core.languages_list,
                value=core.languages_list,
                label="Select languages to average over",
                elem_id="column-select",
                interactive=True,
                scale=6,
            )
            with gr.Column(scale=1):
                clear_langs = gr.ClearButton(
                    langs_bar,
                    value="Deselect all languages",
                    size="sm",
                    scale=1,
                )
                select = gr.Button(
                    value="Select all languages", size="sm", scale=1
                )
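
                # Recreate the checkbox group with every language checked to
                # re-select all languages at once.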
                def update_bar():
                    langs_bar = gr.CheckboxGroup(
                        choices=core.languages_list,
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                    )
                    return langs_bar

                select.click(update_bar, inputs=[], outputs=langs_bar)
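
        # Task list and evaluation-type (0-shot vs. few-shot) filters.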
        with gr.Row():
            acc_task_group_names = core.task_groups_with_task_type("accuracy")
            shown_tasks = gr.CheckboxGroup(
                choices=acc_task_group_names,
                value=acc_task_group_names,
                label="Select tasks to show",
                elem_id="column-select",
                interactive=True,
                scale=50,
            )
            fewshot = gr.Radio(
                choices=[("0-Shot", False), ("Few-shot", True)],
                value=True,
                label="Select evaluation type",
                interactive=True,
                scale=29,
            )
            fewshot.change(
                core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks
            )
            clear_tasks = gr.ClearButton(
                shown_tasks, value="Deselect all tasks", size="sm", scale=21
            )
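
    # Tabbed result views: accuracy table, translation table, and plots.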
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem(
            "🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
        ) as acc:
            leaderboard_table = gr.Dataframe()
        with gr.TabItem(
            "🌐 LLM translation benchmark",
            elem_id="llm-benchmark-tab-table-misc",
            id=1,
        ) as misc:
            leaderboard_table_misc = gr.Dataframe()
        with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
            leaderboard_plot = gr.Plot(elem_id="plot")
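
    # Switching tabs updates the shown task list and the few-shot selector
    # for the selected benchmark.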
    acc.select(
        lambda x: core.update_tab_tasks(0, x),
        inputs=fewshot,
        outputs=[shown_tasks, fewshot],
    )
    misc.select(
        lambda x: core.update_tab_tasks(1, x),
        inputs=fewshot,
        outputs=[shown_tasks, fewshot],
    )
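
    # Re-render both leaderboard tables and the plot whenever a filter
    # changes or a search is submitted.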
    for comp, fn in [
        (search_bar, "submit"),
        (langs_bar, "change"),
        (shown_tasks, "change"),
        (fewshot, "change"),
        (model_types, "change"),
    ]:
        getattr(comp, fn)(
            core.update_df,
            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
            leaderboard_table,
        )
        getattr(comp, fn)(
            core.update_df,
            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
            leaderboard_table_misc,
        )
        getattr(comp, fn)(
            core.update_plot,
            [shown_tasks, search_bar, langs_bar, model_types, fewshot],
            leaderboard_plot,
        )
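
    # Populate the tables and the plot once when the page first loads.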
    demo.load(
        fn=core.update_df,
        inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
        outputs=leaderboard_table,
    )
    demo.load(
        fn=core.update_df,
        inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
        outputs=leaderboard_table_misc,
    )
    demo.load(
        fn=core.update_plot,
        inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
        outputs=leaderboard_plot,
    )
demo.launch()