Added winograd tasks
- custom_tasks.py +2 -1
- src/about.py +2 -1
- src/custom_tasks/winograd_task.py +57 -0
custom_tasks.py
CHANGED

@@ -8,11 +8,12 @@ Author:
 """
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
+from src.custom_tasks.winograd_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
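One thing worth flagging in the unchanged __main__ block: print(t["name"] for t in TASKS_TABLE) passes a generator expression straight to print(), so it prints the generator object rather than the task names. A minimal sketch of the likely intent, assuming each as_dict() result carries a "name" key:

# Materialize the generator so the names themselves are printed,
# e.g. ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
print([t["name"] for t in TASKS_TABLE])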
src/about.py
CHANGED

@@ -21,4 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
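For reference, with the three task names above, the join produces one spec per task in lighteval's suite|task|fewshot|truncate format:

# What TASKS_LIGHTEVAL evaluates to given the task list above:
tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
# -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0'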
src/custom_tasks/winograd_task.py
ADDED

@@ -0,0 +1,57 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def winograd_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some sanitization, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0])  # remove xml tags
+    return 1 if pred == golds[0] else 0
+
+winograd_acc_metric = CorpusLevelMetric(
+    metric="winograd_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=winograd_eval_fn
+)
+extend_enum(Metrics, 'winograd_acc_metric', winograd_acc_metric)
+
+def winograd_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"],
+        choices=line["response"],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+winograd_task = LightevalTaskConfig(
+    name="winograd-acc",
+    prompt_function="winograd_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["winograd"],
+    evaluation_splits=["winograd"],
+    metric=['winograd_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=32
+)
+winograd_task.stop_sequence = as_list(winograd_task.stop_sequence)
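As a standalone sanity check of the scoring logic (not part of the commit; the function body is copied from the diff above), note that the XML-tag stripping rescues tag-wrapped answers, but the comparison itself is exact, with no whitespace or case normalization:

import re

def winograd_eval_fn(golds, predictions, formatted_doc=None):
    if len(predictions) > 1:
        raise ValueError("Predictions should have one item")
    pred = re.sub('<[^>]+>', '', predictions[0])  # strip xml-style tags
    return 1 if pred == golds[0] else 0

# Tag-wrapped output still matches after sanitization:
assert winograd_eval_fn(["the dog"], ["<answer>the dog</answer>"]) == 1
# But stray whitespace is not stripped, so this scores 0:
assert winograd_eval_fn(["the dog"], [" the dog"]) == 0

Depending on how chatty the evaluated models are, a .strip() on the prediction before comparing might be worth considering.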