Commit f224572 · RicardoDominguez committed
Parent: aa8c9ef

style changes

Files changed:
- README.md (+6 -1)
- app.py (+6 -6)
- src/about.py (+2 -4)
- src/display/utils.py (+8 -8)
README.md (CHANGED)

@@ -41,4 +41,9 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+
+
+# Todo
+
+* Change background to white
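For the new "Change background to white" todo, a hypothetical sketch of one common approach, assuming Gradio's standard `css` parameter on `gr.Blocks` (the template keeps its stylesheet in `src/display/css_html_js.py`, which this commit does not touch; the selectors below are assumptions, not code from this repo):

```python
import gradio as gr

# Hypothetical sketch for the "Change background to white" todo.
# gr.Blocks accepts a raw CSS string; these selectors are assumptions.
WHITE_BACKGROUND_CSS = """
body, .gradio-container { background-color: white; }
"""

demo = gr.Blocks(css=WHITE_BACKGROUND_CSS)
with demo:
    gr.Markdown("leaderboard placeholder")

if __name__ == "__main__":
    demo.launch()
```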
app.py (CHANGED)

@@ -66,13 +66,13 @@ def init_leaderboard(dataframe):
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
+            label="Select columns to display:",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -80,9 +80,9 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -95,7 +95,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏛️ CaselawQA", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
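The three `fields(AutoEvalColumn)` comprehensions above are why commenting out a single `append` in `src/display/utils.py` removes a column from selection, hiding, and filtering all at once. A minimal sketch of what they compute, assuming the template's own `fields()` helper from `src/display/utils.py` (not `dataclasses.fields`); the toy columns are hypothetical:

```python
from dataclasses import dataclass

# Simplified stand-in for ColumnContent; the field names follow the
# calls visible in the src/display/utils.py diff below.
@dataclass(frozen=True)
class Col:
    name: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

class AutoEvalColumn:  # toy stand-in; the real class is built via make_dataclass
    model = Col("Model", True, never_hidden=True)
    params = Col("#Params (B)", False)
    revision = Col("Model sha", False, hidden=True)

def fields(raw_class):
    # Mirrors the template's helper: collect the column descriptors
    # declared as class attributes.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])  # ['Model']
print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])          # ['Model']
print([c.name for c in fields(AutoEvalColumn) if c.hidden])                # ['Model sha']
```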
src/about.py (CHANGED)

@@ -25,7 +25,7 @@ TITLE = """<h1 align="center" id="space-title">CaselawQA leaderboard (WIP)</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-CaselawQA is a benchmark comprising classification tasks
+CaselawQA is a benchmark comprising legal classification tasks derived from the Supreme Court and Songer Court of Appeals legal databases.
 From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
 From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
 """
@@ -82,8 +82,7 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
-
-@misc{dominguezolmedo2024lawmapowerspecializationlegal,
+@misc{dominguezolmedo2024lawma,
       title={Lawma: The Power of Specialization for Legal Tasks},
       author={Ricardo Dominguez-Olmedo and Vedant Nanda and Rediet Abebe and Stefan Bechtold and Christoph Engel and Jens Frankenreiter and Krishna Gummadi and Moritz Hardt and Michael Livermore},
       year={2024},
@@ -92,5 +91,4 @@ CITATION_BUTTON_TEXT = r"""
       primaryClass={cs.CL},
      url={https://arxiv.org/abs/2407.16615},
 }
-```
 """
src/display/utils.py (CHANGED)

@@ -26,19 +26,19 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
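A minimal sketch of the `make_dataclass` pattern above: each `[attribute_name, type, default_instance]` triple becomes one field of the frozen `AutoEvalColumn` class, so commenting out an `append` (as this commit does for `average`, `likes`, and others) removes that column from everything built on the class. `ColumnContent`'s field list is inferred from the append calls in this diff; the exact definition lives elsewhere in `src/display/utils.py`:

```python
from dataclasses import dataclass, make_dataclass

# ColumnContent as inferred from the append calls above (an assumption;
# the real definition may differ in detail).
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])

# Each triple is (field name, annotation, default value); the defaults
# end up as class attributes on the generated frozen dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model.name)         # 'Model'
print(hasattr(AutoEvalColumn, "likes"))  # False: the commented-out column is gone
```

The comprehensions in `app.py` then iterate over exactly the columns that were appended, which is how this commit trims the table without touching the UI code.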