RicardoDominguez committed · Commit 279610b · Parent(s): 1358bcc

Commit message: about

Files changed:
- README.md (+2 -2)
- src/about.py (+38 -14)
- src/envs.py (+4 -4)
README.md CHANGED

@@ -1,6 +1,6 @@
 ---
-title: …
-emoji: …
+title: CaselawQA leaderboard (WIP)
+emoji: 🏛️
 colorFrom: green
 colorTo: indigo
 sdk: gradio
src/about.py CHANGED

@@ -12,29 +12,45 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("…
-    task1 = Task("…
-
+    task0 = Task("caselawqa", "exact_match", "CaselawQA")
+    task1 = Task("caselawqa_tiny", "exact_match", "CaselawQA Tiny")
+    task2 = Task("caselawqa_hard", "exact_match", "CaselawQA Hard")
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">…
+TITLE = """<h1 align="center" id="space-title">CaselawQA leaderboard (WIP)</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-…
+CaselawQA is a benchmark comprising classification tasks, drawing from the Supreme Court and Songer Court of Appeals legal databases.
+From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
+From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## …
+## Introduction
+
+CaselawQA is a benchmark comprising legal classification tasks, drawing from the Supreme Court and Songer Court of Appeals legal databases.
+The majority of its 10,000 questions are multiple-choice, with 5,000 sourced from each database.
+The questions are randomly selected from the test sets of the [Lawma tasks](https://huggingface.co/datasets/ricdomolm/lawma-tasks).\
+From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
+From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
+CaselawQA also includes two additional subsets: CaselawQA Tiny and CaselawQA Hard.
+CaselawQA Tiny consists of 49 Lawma tasks with fewer than 150 training examples.
+CaselawQA Hard comprises tasks where [Lawma 70B](https://huggingface.co/ricdomolm/lawma-70b) achieves less than 70% accuracy.
+
+You can find more information in the [Lawma arXiv preprint](https://arxiv.org/abs/2407.16615) and [GitHub repository](https://github.com/socialfoundations/lawma).
 
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
 
+We evaluate CaselawQA using [this](https://github.com/socialfoundations/lm-evaluation-harness/tree/caselawqa) LM Eval Harness implementation:
+
+```bash
+lm_eval --model hf --model_args "pretrained=<your_model>,dtype=bfloat16" --tasks caselawqa,caselawqa_tiny,caselawqa_hard --output_path=<output_path>
+```
 """
 
 EVALUATION_QUEUE_TEXT = """

@@ -50,16 +66,13 @@ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
 
 Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option
+Note: if your model needs `use_remote_code=True`, we do not support this option.
 
 ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
 It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
-### 3) …
-…
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+### 3) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.

@@ -69,4 +82,15 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+```bibtex
+@misc{dominguezolmedo2024lawmapowerspecializationlegal,
+      title={Lawma: The Power of Specialization for Legal Tasks},
+      author={Ricardo Dominguez-Olmedo and Vedant Nanda and Rediet Abebe and Stefan Bechtold and Christoph Engel and Jens Frankenreiter and Krishna Gummadi and Moritz Hardt and Michael Livermore},
+      year={2024},
+      eprint={2407.16615},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2407.16615},
+}
+```
 """
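For context, the `Task` entries added above are instances of a small dataclass defined just before this hunk (the hunk header shows `class Task:`). A minimal sketch, assuming the stock Hugging Face demo-leaderboard template this Space builds on is otherwise unchanged; the field names `benchmark`, `metric`, and `col_name` come from that template, not from this diff:

```python
# Sketch of the Task dataclass the hunk context (`class Task:`) refers to.
# Field names follow the stock demo-leaderboard template; they are an
# assumption, since the dataclass itself is not part of this diff.
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results JSON, e.g. "caselawqa"
    metric: str     # metric key in the results JSON, e.g. "exact_match"
    col_name: str   # column name displayed in the leaderboard UI

class Tasks(Enum):
    task0 = Task("caselawqa", "exact_match", "CaselawQA")
    task1 = Task("caselawqa_tiny", "exact_match", "CaselawQA Tiny")
    task2 = Task("caselawqa_hard", "exact_match", "CaselawQA Hard")
```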
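Step 1 of the submission checklist ends with the line shown in the second hunk's context, `tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)`: submitters should verify their model loads with vanilla AutoClasses before queuing it. A minimal sketch, reconstructed on the assumption that this part of the template is unchanged; `"your model name"` and `revision` are placeholders for the submitter's repo and commit:

```python
# Pre-submission sanity check: the model must load with standard AutoClasses.
# Reconstructed from the hunk context; replace the placeholders with your
# own model repo and revision.
from transformers import AutoConfig, AutoModel, AutoTokenizer

revision = "main"
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```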
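The `lm_eval` command in the Reproducibility section writes a JSON results file under `<output_path>`. A minimal sketch of how the three `exact_match` scores could be read back out; the file name and the exact metric key are assumptions, since recent harness versions suffix metrics with a filter name (e.g. `exact_match,none`):

```python
# Read lm-eval results back out (hypothetical file name; adjust to the
# results_*.json that lm_eval actually writes under <output_path>).
import json

with open("results.json") as f:
    data = json.load(f)

for task in ("caselawqa", "caselawqa_tiny", "caselawqa_hard"):
    metrics = data["results"][task]
    # Metric key may be plain "exact_match" or filter-suffixed
    # "exact_match,none", depending on the harness version.
    score = metrics.get("exact_match", metrics.get("exact_match,none"))
    print(f"{task}: {score}")
```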
src/envs.py CHANGED

@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "…
+OWNER = "ricdomolm" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/…
-QUEUE_REPO = f"{OWNER}/…
-RESULTS_REPO = f"{OWNER}/…
+REPO_ID = f"{OWNER}/caselawqa_leaderboard"
+QUEUE_REPO = f"{OWNER}/caselawqa_leaderboard_requests"
+RESULTS_REPO = f"{OWNER}/caselawqa_leaderboard_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
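For context, elsewhere in the template these constants configure the Hub client and the local cache layout. A minimal sketch assuming the stock demo-leaderboard `envs.py`; the `API`, `EVAL_REQUESTS_PATH`, and `EVAL_RESULTS_PATH` names come from that template, not from this diff:

```python
# How the renamed repos are typically consumed (assumed from the stock
# demo-leaderboard template; none of this is part of the diff above).
import os
from huggingface_hub import HfApi

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "ricdomolm"
QUEUE_REPO = f"{OWNER}/caselawqa_leaderboard_requests"
RESULTS_REPO = f"{OWNER}/caselawqa_leaderboard_results"

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")   # local queue snapshot
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")  # local results snapshot

API = HfApi(token=TOKEN)  # single client reused for downloads and uploads
```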