Shaltiel committed on
Commit
5f1bc85
1 Parent(s): eb2a0ba

Added winograd tasks

Browse files
custom_tasks.py CHANGED
@@ -8,11 +8,12 @@ Author:
8
  """
9
  from src.custom_tasks.heq_task import *
10
  from src.custom_tasks.sentiment_task import *
 
11
 
12
  ## MODULE LOGIC
13
  # You should not need to touch this
14
  # Convert to dict for lighteval
15
- TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task]]
16
 
17
  if __name__ == "__main__":
18
  print(t["name"] for t in TASKS_TABLE)
 
8
  """
9
  from src.custom_tasks.heq_task import *
10
  from src.custom_tasks.sentiment_task import *
11
+ from src.custom_tasks.winograd_task import *
12
 
13
  ## MODULE LOGIC
14
  # You should not need to touch this
15
  # Convert to dict for lighteval
16
# Convert every registered task config to a plain dict, as lighteval expects.
TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]

if __name__ == "__main__":
    # Materialize the names into a list before printing: passing a bare
    # generator expression to print() would only output
    # "<generator object ...>" instead of the task names.
    print([t["name"] for t in TASKS_TABLE])
src/about.py CHANGED
@@ -21,4 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
21
  # ---------------------------------------------------
22
 
23
  # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
24
- TASKS_LIGHTEVAL = "custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0"
 
 
21
  # ---------------------------------------------------
22
 
23
  # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
24
# Task names exposed on the leaderboard; each is expanded into a lighteval
# spec string of the form "custom|<name>|<num_fewshot>|<truncate>".
tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
TASKS_LIGHTEVAL = ','.join(['|'.join(('custom', name, '0', '0')) for name in tasks])
src/custom_tasks/winograd_task.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
4
+ from lighteval.metrics import Metrics, MetricCategory
5
+ from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
6
+ from aenum import extend_enum
7
+ import numpy as np
8
+ from lighteval.tasks.requests import Doc
9
+ from Levenshtein import distance
10
+ import collections
11
+ from lighteval.utils import as_list
12
+
13
def winograd_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
    """Score one winograd sample by exact match against the gold answer.

    Returns 1 if the sanitized prediction equals the first gold string,
    0 otherwise. Raises ValueError unless exactly one prediction is given.
    """
    # Exactly one prediction is expected. The original `> 1` check let an
    # empty list fall through and crash with IndexError at predictions[0];
    # `!= 1` raises the intended ValueError for both too many and too few.
    if len(predictions) != 1:
        raise ValueError("Predictions should have one item")
    # do some sanitizations, since some models produce more info
    pred = re.sub('<[^>]+>', '', predictions[0])  # remove xml tags
    return 1 if pred == golds[0] else 0
19
+
20
# Corpus-level accuracy metric for the winograd task: winograd_eval_fn scores
# each sample 0/1, and the corpus score is the mean over all samples.
winograd_acc_metric = CorpusLevelMetric(
    metric="winograd_acc",
    higher_is_better=True,
    category=MetricCategory.GENERATIVE,  # scored on generated text, not logprobs
    use_case=MetricUseCase.ACCURACY,
    corpus_level_fn=np.mean,  # aggregate the per-sample 0/1 scores
    sample_level_fn=winograd_eval_fn
)
# Register the metric on lighteval's Metrics enum so the task config below can
# reference it by name ('winograd_acc_metric').
extend_enum(Metrics, 'winograd_acc_metric', winograd_acc_metric)
29
+
30
def winograd_prompt_fn(line, task_name: str = None):
    """Map one raw dataset row onto a lighteval ``Doc``.

    The row's "prompt" column becomes the query and its "response" column
    the choices; the gold answer is taken to be the first choice.
    """
    doc_fields = {
        "task_name": task_name,
        "query": line["prompt"],
        "choices": line["response"],
        "gold_index": 0,  # presumably the dataset lists the gold answer first — TODO confirm
        "instruction": "",
    }
    return Doc(**doc_fields)
42
+
43
# This is how you create a simple task (like hellaswag) which has one single
# subset attached to it, and one evaluation possible.
winograd_task = LightevalTaskConfig(
    name="winograd-acc",
    prompt_function="winograd_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
    suite=["custom"],
    hf_repo="dicta-hebrew-llm-leaderboard/tests",  # HF dataset holding the eval data
    hf_subset="default",
    hf_avail_splits=["winograd"],
    evaluation_splits=["winograd"],
    metric=['winograd_acc_metric'],  # registered above via extend_enum
    stop_sequence=['\n'],  # stop generation at the first newline
    generation_size=32
)
# Normalize stop_sequence to a list in place, in case the config stored it
# as a bare string.
winograd_task.stop_sequence = as_list(winograd_task.stop_sequence)