Spaces:

open-llm-leaderboard
/

comparator

Running on CPU Upgrade

App Files Files Community

albertvillanova HF staff committed on Oct 29, 2024

Commit

bea7063

verified ·

1 Parent(s): 22fb9eb

Support more than 2 models

Browse files

Files changed (3) hide show

app.py +27 -39
src/details.py +24 -22
src/results.py +22 -20

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from src.details import (
     clear_details,
     display_details,
     display_loading_message_for_details,
-    load_details_dataframes,
     update_load_details_component,
     update_sample_idx_component,
     update_subtasks_component,
@@ -20,7 +20,7 @@ from src.results import (
     display_results,
     download_results,
     fetch_result_paths,
-    load_results_dataframes,
     plot_results,
     sort_result_paths_per_model,
     update_load_results_component,
@@ -30,11 +30,11 @@ from src.results import (
 # if __name__ == "__main__":
 result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
-load_results_dataframes = partial(load_results_dataframes, result_paths_per_model=result_paths_per_model)
-with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}") as demo:
     gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
-    gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
     gr.HTML(
         "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
     )
@@ -43,10 +43,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
         "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
     )
     with gr.Row():
-        with gr.Column():
-            model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
-        with gr.Column():
-            model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
     with gr.Row():
         with gr.Tab("Results"):
@@ -69,8 +66,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
                 results_plot_1 = gr.Plot(visible=True)
                 results_plot_2 = gr.Plot(visible=True)
             results = gr.HTML()
-            results_dataframe_1 = gr.Dataframe(visible=False)
-            results_dataframe_2 = gr.Dataframe(visible=False)
             download_results_btn = gr.Button("Download")
             results_file = gr.File(visible=False)
         with gr.Tab("Configs"):
@@ -115,12 +111,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
             )
             details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
             details = gr.HTML()
-            details_dataframe_1 = gr.Dataframe(visible=False)
-            details_dataframe_2 = gr.Dataframe(visible=False)
-            details_dataframe = gr.DataFrame(visible=False)
     gr.on(
-        triggers=[model_id_1.input, model_id_2.input],
         fn=update_load_results_component,
         outputs=[load_results_btn, load_configs_btn],
     )
@@ -129,9 +123,9 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
         fn=display_loading_message_for_results,
         outputs=[results, configs],
     ).then(
-        fn=load_results_dataframes,
-        inputs=[model_id_1, model_id_2],
-        outputs=[results_dataframe_1, results_dataframe_2],
     ).then(
         fn=update_tasks_component,
         outputs=[results_task, configs_task],
@@ -152,18 +146,17 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
     # Display results
     gr.on(
         triggers=[
-            results_dataframe_1.change,
-            results_dataframe_2.change,
             results_task.change,
             hide_std_errors.change,
             show_only_differences.change,
         ],
         fn=display_results,
-        inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
         outputs=[results, configs],
     ).then(
         fn=plot_results,
-        inputs=[results_task, results_dataframe_1, results_dataframe_2],  # results,
         outputs=[results_plot_1, results_plot_2],
     ).then(
         fn=clear_results_file,
@@ -178,10 +171,8 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
         triggers=[clear_results_btn.click, clear_configs_btn.click],
         fn=clear_results,
         outputs=[
-            model_id_1,
-            model_id_2,
-            results_dataframe_1,
-            results_dataframe_2,
             load_results_btn,
             load_configs_btn,
             results_task,
@@ -203,41 +194,38 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
         outputs=[login_btn, subtask],
     )
     gr.on(
-        triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
         fn=update_load_details_component,
-        inputs=[model_id_1, model_id_2, subtask],
         outputs=load_details_btn,
     )
     load_details_btn.click(
         fn=display_loading_message_for_details,
         outputs=details,
     ).then(
-        fn=load_details_dataframes,
-        inputs=[subtask, model_id_1, model_id_2],
-        outputs=[details_dataframe_1, details_dataframe_2],
     ).then(
         fn=update_sample_idx_component,
-        inputs=[details_dataframe_1, details_dataframe_2],
         outputs=sample_idx,
     )
     gr.on(
         triggers=[
-            details_dataframe_1.change,
-            details_dataframe_2.change,
             sample_idx.change,
             details_show_only_differences.change,
         ],
         fn=display_details,
-        inputs=[sample_idx, details_show_only_differences, details_dataframe_1, details_dataframe_2],
         outputs=details,
     )
     clear_details_btn.click(
         fn=clear_details,
         outputs=[
-            model_id_1,
-            model_id_2,
-            details_dataframe_1,
-            details_dataframe_2,
             details_task,
             subtask,
             load_details_btn,

     clear_details,
     display_details,
     display_loading_message_for_details,
+    load_details,
     update_load_details_component,
     update_sample_idx_component,
     update_subtasks_component,
     display_results,
     download_results,
     fetch_result_paths,
+    load_results,
     plot_results,
     sort_result_paths_per_model,
     update_load_results_component,
 # if __name__ == "__main__":
 result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
+load_results = partial(load_results, result_paths_per_model=result_paths_per_model)
+with gr.Blocks(fill_height=True, fill_width=True) as demo:
     gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
+    gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
     gr.HTML(
         "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
     )
         "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
     )
     with gr.Row():
+        model_ids = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models", multiselect=True)
     with gr.Row():
         with gr.Tab("Results"):
                 results_plot_1 = gr.Plot(visible=True)
                 results_plot_2 = gr.Plot(visible=True)
             results = gr.HTML()
+            results_dataframe = gr.State()
             download_results_btn = gr.Button("Download")
             results_file = gr.File(visible=False)
         with gr.Tab("Configs"):
             )
             details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
             details = gr.HTML()
+            details_dataframe = gr.State()
     gr.on(
+        triggers=[model_ids.input],
         fn=update_load_results_component,
         outputs=[load_results_btn, load_configs_btn],
     )
         fn=display_loading_message_for_results,
         outputs=[results, configs],
     ).then(
+        fn=load_results,
+        inputs=model_ids,
+        outputs=results_dataframe,
     ).then(
         fn=update_tasks_component,
         outputs=[results_task, configs_task],
     # Display results
     gr.on(
         triggers=[
+            results_dataframe.change,
             results_task.change,
             hide_std_errors.change,
             show_only_differences.change,
         ],
         fn=display_results,
+        inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
         outputs=[results, configs],
     ).then(
         fn=plot_results,
+        inputs=[results_dataframe, results_task],
         outputs=[results_plot_1, results_plot_2],
     ).then(
         fn=clear_results_file,
         triggers=[clear_results_btn.click, clear_configs_btn.click],
         fn=clear_results,
         outputs=[
+            model_ids,
+            results_dataframe,
             load_results_btn,
             load_configs_btn,
             results_task,
         outputs=[login_btn, subtask],
     )
     gr.on(
+        triggers=[model_ids.input, subtask.input, details_task.input],
         fn=update_load_details_component,
+        inputs=[model_ids, subtask],
         outputs=load_details_btn,
     )
     load_details_btn.click(
         fn=display_loading_message_for_details,
         outputs=details,
     ).then(
+        fn=load_details,
+        inputs=[model_ids, subtask],
+        outputs=details_dataframe,
     ).then(
         fn=update_sample_idx_component,
+        inputs=[details_dataframe],
         outputs=sample_idx,
     )
     gr.on(
         triggers=[
+            details_dataframe.change,
             sample_idx.change,
             details_show_only_differences.change,
         ],
         fn=display_details,
+        inputs=[details_dataframe, sample_idx, details_show_only_differences],
         outputs=details,
     )
     clear_details_btn.click(
         fn=clear_details,
         outputs=[
+            model_ids,
+            details_dataframe,
             details_task,
             subtask,
             load_details_btn,

src/details.py CHANGED Viewed

@@ -32,8 +32,8 @@ def update_subtasks_component(task, profile: gr.OAuthProfile | None):
     )
-def update_load_details_component(model_id_1, model_id_2, subtask):
-    if (model_id_1 or model_id_2) and subtask:
         return gr.Button("Load Details", interactive=True)
     else:
         return gr.Button("Load Details", interactive=False)
@@ -56,24 +56,22 @@ async def load_details_dataframe(model_id, subtask):
     path = max(paths)
     data = await load_jsonlines_file(path)
     df = pd.json_normalize(data)
-    df = df.sort_values(by=["doc_id"])
-    # df = df.rename_axis("Parameters", axis="columns")
-    df["model_name"] = model_id  # Keep model_name
-    return df
-    # return df.set_index(pd.Index([model_id])).reset_index()
-async def load_details_dataframes(subtask, *model_ids):
-    result = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
-    return result
-def display_details(sample_idx, show_only_differences, *dfs):
-    rows = [df.iloc[sample_idx] for df in dfs if "model_name" in df.columns and sample_idx < len(df)]
-    if not rows:
         return
-    # Pop model_name and add it to the column name
-    df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
     # Style
     # - Option: Show only differences
@@ -92,15 +90,21 @@ def display_details(sample_idx, show_only_differences, *dfs):
                 {
                     "selector": "td",
                     "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
-                }
             ]
         )
         .to_html()
     )
-def update_sample_idx_component(*dfs):
-    maximum = max([len(df) - 1 for df in dfs])
     return gr.Number(
         label="Sample Index",
         info="Index of the sample to be displayed",
@@ -112,11 +116,9 @@ def update_sample_idx_component(*dfs):
 def clear_details():
-    # model_id_1, model_id_2, details_dataframe_1, details_dataframe_2, details_task, subtask, load_details_btn, sample_idx
     return (
-        None,
-        None,
-        None,
         None,
         None,
         None,

     )
+def update_load_details_component(model_id, subtask):
+    if model_id and subtask:
         return gr.Button("Load Details", interactive=True)
     else:
         return gr.Button("Load Details", interactive=False)
     path = max(paths)
     data = await load_jsonlines_file(path)
     df = pd.json_normalize(data)
+    # Keep model_name:
+    df["model_name"] = model_id
+    return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
+async def load_details(model_ids, subtask):
+    dfs = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
+    if dfs:
+        return pd.concat(dfs)
+def display_details(df, sample_idx, show_only_differences):
+    if df is None:
         return
+    df = df.loc[df.index.levels[0][sample_idx]]
+    df = df.T.rename_axis(columns=None)
     # Style
     # - Option: Show only differences
                 {
                     "selector": "td",
                     "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
+                },
+                {
+                    "selector": ".col_heading",
+                    "props": [("width", f"{100 / len(df.columns)}%")],
+                },
             ]
         )
         .to_html()
     )
+def update_sample_idx_component(df):
+    if df is None:
+        return
+    maximum = len(df) - 1
     return gr.Number(
         label="Sample Index",
         info="Index of the sample to be displayed",
 def clear_details():
+    # model_ids, details_dataframe, details_task, subtask, load_details_btn, sample_idx
     return (
+        gr.Dropdown(value=[]),
         None,
         None,
         None,

src/results.py CHANGED Viewed

@@ -42,24 +42,16 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
         model_name = result.get("model_name", "Model")
     df = pd.json_normalize([data])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
-    return df.set_index(pd.Index([model_name])).reset_index()
-async def load_results_dataframes(*model_ids, result_paths_per_model=None):
-    result = await asyncio.gather(
-        *[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
-    )
-    return result
-def concat_results(dfs):
-    dfs = [df.set_index("index") for df in dfs if "index" in df.columns]
     if dfs:
         return pd.concat(dfs)
-def display_results(task, hide_std_errors, show_only_differences, *dfs):
-    df = concat_results(dfs)
     if df is None:
         return None, None
     df = df.T.rename_axis(columns=None)
@@ -111,6 +103,19 @@ def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False
     # Format index values: remove prefix and suffix
     start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
     df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
     return df.to_html()
@@ -127,11 +132,9 @@ def update_tasks_component():
 def clear_results():
-    # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
     return (
-        None,
-        None,
-        None,
         None,
         *(gr.Button("Load", interactive=False),) * 2,
         *(
@@ -151,8 +154,7 @@ def display_loading_message_for_results():
     return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
-def plot_results(task, *dfs):
-    df = concat_results(dfs)
     if df is not None:
         df = df[
             [
@@ -190,7 +192,7 @@ def plot_results(task, *dfs):
             df.T.rename_axis(columns="Model"),
             barmode="group",
             labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
-            color_discrete_sequence=["#FF9D00", "#32343D"],
         )
         fig_1.update_yaxes(range=[0, 1])
         fig_2 = px.line_polar(
@@ -200,7 +202,7 @@ def plot_results(task, *dfs):
             color="Model",
             line_close=True,
             range_r=[0, 1],
-            color_discrete_sequence=["#FF9D00", "#32343D"],
         )
         # Avoid bug with radar:
         fig_2.update_layout(

         model_name = result.get("model_name", "Model")
     df = pd.json_normalize([data])
     # df.columns = df.columns.str.split(".")  # .split return a list instead of a tuple
+    return df.set_index(pd.Index([model_name]))
+async def load_results(model_ids, result_paths_per_model=None):
+    dfs = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
     if dfs:
         return pd.concat(dfs)
+def display_results(df, task, hide_std_errors, show_only_differences):
     if df is None:
         return None, None
     df = df.T.rename_axis(columns=None)
     # Format index values: remove prefix and suffix
     start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
     df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
+    # Fix overflow
+    df.set_table_styles(
+        [
+            {
+                "selector": "td",
+                "props": [("overflow-wrap", "break-word"), ("max-width", "1px")],
+            },
+            {
+                "selector": ".col_heading",
+                "props": [("width", f"{100 / len(df.columns)}%")],
+            },
+        ]
+    )
     return df.to_html()
 def clear_results():
+    # model_ids, dataframe, load_results_btn, load_configs_btn, results_task, configs_task
     return (
+        gr.Dropdown(value=[]),
         None,
         *(gr.Button("Load", interactive=False),) * 2,
         *(
     return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
+def plot_results(df, task):
     if df is not None:
         df = df[
             [
             df.T.rename_axis(columns="Model"),
             barmode="group",
             labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
+            color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
         )
         fig_1.update_yaxes(range=[0, 1])
         fig_2 = px.line_polar(
             color="Model",
             line_close=True,
             range_r=[0, 1],
+            color_discrete_sequence=px.colors.qualitative.Safe,  # TODO: https://plotly.com/python/discrete-color/
         )
         # Avoid bug with radar:
         fig_2.update_layout(