Commit f224572 · RicardoDominguez committed
Parent: aa8c9ef

style changes

Files changed:
- README.md (+6 -1)
- app.py (+6 -6)
- src/about.py (+2 -4)
- src/display/utils.py (+8 -8)
README.md (CHANGED)

@@ -41,4 +41,9 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+
+
+# Todo
+
+* Change background to white
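For the new "Change background to white" todo, a hypothetical sketch of one common approach, assuming Gradio's standard `css` parameter on `gr.Blocks` (the template keeps its stylesheet in `src/display/css_html_js.py`, which this commit does not touch; the selectors below are assumptions, not code from this repo):

```python
import gradio as gr

# Hypothetical sketch for the "Change background to white" todo.
# gr.Blocks accepts a raw CSS string; these selectors are assumptions.
WHITE_BACKGROUND_CSS = """
body, .gradio-container { background-color: white; }
"""

demo = gr.Blocks(css=WHITE_BACKGROUND_CSS)
with demo:
    gr.Markdown("leaderboard placeholder")

if __name__ == "__main__":
    demo.launch()
```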
app.py (CHANGED)

@@ -66,13 +66,13 @@ def init_leaderboard(dataframe):
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
+            label="Select columns to display:",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -80,9 +80,9 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -95,7 +95,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏛️ CaselawQA", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
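The three `fields(AutoEvalColumn)` comprehensions above are why commenting out a single `append` in `src/display/utils.py` removes a column from selection, hiding, and filtering all at once. A minimal sketch of what they compute, assuming the template's own `fields()` helper from `src/display/utils.py` (not `dataclasses.fields`); the toy columns are hypothetical:

```python
from dataclasses import dataclass

# Simplified stand-in for ColumnContent; the field names follow the
# calls visible in the src/display/utils.py diff below.
@dataclass(frozen=True)
class Col:
    name: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

class AutoEvalColumn:  # toy stand-in; the real class is built via make_dataclass
    model = Col("Model", True, never_hidden=True)
    params = Col("#Params (B)", False)
    revision = Col("Model sha", False, hidden=True)

def fields(raw_class):
    # Mirrors the template's helper: collect the column descriptors
    # declared as class attributes.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])  # ['Model']
print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])          # ['Model']
print([c.name for c in fields(AutoEvalColumn) if c.hidden])                # ['Model sha']
```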
src/about.py (CHANGED)

@@ -25,7 +25,7 @@ TITLE = """<h1 align="center" id="space-title">CaselawQA leaderboard (WIP)</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-CaselawQA is a benchmark comprising classification tasks
+CaselawQA is a benchmark comprising legal classification tasks derived from the Supreme Court and Songer Court of Appeals legal databases.
 From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
 From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
 """
@@ -82,8 +82,7 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
-
-@misc{dominguezolmedo2024lawmapowerspecializationlegal,
+@misc{dominguezolmedo2024lawma,
       title={Lawma: The Power of Specialization for Legal Tasks},
       author={Ricardo Dominguez-Olmedo and Vedant Nanda and Rediet Abebe and Stefan Bechtold and Christoph Engel and Jens Frankenreiter and Krishna Gummadi and Moritz Hardt and Michael Livermore},
       year={2024},
@@ -92,5 +91,4 @@ CITATION_BUTTON_TEXT = r"""
       primaryClass={cs.CL},
      url={https://arxiv.org/abs/2407.16615},
 }
-```
 """
src/display/utils.py (CHANGED)

@@ -26,19 +26,19 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
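A minimal sketch of the `make_dataclass` pattern above: each `[attribute_name, type, default_instance]` triple becomes one field of the frozen `AutoEvalColumn` class, so commenting out an `append` (as this commit does for `average`, `likes`, and others) removes that column from everything built on the class. `ColumnContent`'s field list is inferred from the append calls in this diff; the exact definition lives elsewhere in `src/display/utils.py`:

```python
from dataclasses import dataclass, make_dataclass

# ColumnContent as inferred from the append calls above (an assumption;
# the real definition may differ in detail).
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])

# Each triple is (field name, annotation, default value); the defaults
# end up as class attributes on the generated frozen dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model.name)         # 'Model'
print(hasattr(AutoEvalColumn, "likes"))  # False: the commented-out column is gone
```

The comprehensions in `app.py` then iterate over exactly the columns that were appended, which is how this commit trims the table without touching the UI code.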