RicardoDominguez committed on
Commit 279610b · 1 Parent(s): 1358bcc
Files changed (3)
  1. README.md +2 -2
  2. src/about.py +38 -14
  3. src/envs.py +4 -4
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Demo Leaderboard
-emoji: 🥇
+title: CaselawQA leaderboard (WIP)
+emoji: 🏛️
 colorFrom: green
 colorTo: indigo
 sdk: gradio
src/about.py CHANGED
@@ -12,29 +12,46 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
+    task0 = Task("caselawqa", "exact_match", "CaselawQA")
+    task1 = Task("caselawqa_tiny", "exact_match", "CaselawQA Tiny")
+    task2 = Task("caselawqa_hard", "exact_match", "CaselawQA Hard")
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">CaselawQA leaderboard (WIP)</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+CaselawQA is a benchmark comprising classification tasks drawn from the Supreme Court and Songer Court of Appeals legal databases.
+From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
+From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
+## Introduction
+
+CaselawQA is a benchmark comprising legal classification tasks drawn from the Supreme Court and Songer Court of Appeals legal databases.
+The majority of its 10,000 questions are multiple-choice, with 5,000 sourced from each database.
+The questions are randomly selected from the test sets of the [Lawma tasks](https://huggingface.co/datasets/ricdomolm/lawma-tasks).
+From a technical machine learning perspective, these tasks provide highly non-trivial classification problems where even the best models leave much room for improvement.
+From a substantive legal perspective, efficient solutions to such classification problems have rich and important applications in legal research.
+CaselawQA also includes two additional subsets: CaselawQA Tiny and CaselawQA Hard.
+CaselawQA Tiny consists of 49 Lawma tasks with fewer than 150 training examples.
+CaselawQA Hard comprises tasks where [Lawma 70B](https://huggingface.co/ricdomolm/lawma-70b) achieves less than 70% accuracy.
+
+You can find more information in the [Lawma arXiv preprint](https://arxiv.org/abs/2407.16615) and [GitHub repository](https://github.com/socialfoundations/lawma).
 
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
 
+You can evaluate CaselawQA using [this](https://github.com/socialfoundations/lm-evaluation-harness/tree/caselawqa) LM Eval Harness implementation:
+
+```bash
+lm_eval --model hf --model_args "pretrained=<your_model>,dtype=bfloat16" --tasks caselawqa,caselawqa_tiny,caselawqa_hard --output_path=<output_path>
+```
 """
 
 EVALUATION_QUEUE_TEXT = """
@@ -50,16 +66,13 @@ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
 
 Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+Note: if your model needs `use_remote_code=True`, we do not support this option.
 
 ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
 It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+### 3) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
@@ -69,4 +82,15 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+```bibtex
+@misc{dominguezolmedo2024lawmapowerspecializationlegal,
+      title={Lawma: The Power of Specialization for Legal Tasks},
+      author={Ricardo Dominguez-Olmedo and Vedant Nanda and Rediet Abebe and Stefan Bechtold and Christoph Engel and Jens Frankenreiter and Krishna Gummadi and Moritz Hardt and Michael Livermore},
+      year={2024},
+      eprint={2407.16615},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2407.16615},
+}
+```
 """
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "ricdomolm" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/caselawqa_leaderboard"
+QUEUE_REPO = f"{OWNER}/caselawqa_leaderboard_requests"
+RESULTS_REPO = f"{OWNER}/caselawqa_leaderboard_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
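The comment on `OWNER` is a reminder that the queue and results repos must already exist as dataset repos before the Space can write to them. A minimal one-time setup sketch with `huggingface_hub`, using the repo names configured above:

```python
# One-time setup sketch: create the request and results dataset repos
# that QUEUE_REPO and RESULTS_REPO point to. exist_ok=True makes this
# safe to re-run if the repos are already there.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))
for repo_id in ["ricdomolm/caselawqa_leaderboard_requests",
                "ricdomolm/caselawqa_leaderboard_results"]:
    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
```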